mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-14 02:02:46 +00:00
Add client example of grouped conv2d forward (data type: fp16) (#488)
* Rename example folder for GroupedConvFwdMultipleD
* Unify example codes
* Change target names
* Add fp16 example for multiple d instance
* Re-format common.hpp
* Add interface 'DeviceGroupedConvFwd'
* Use simpler interface
* Move common conv params out
* Rename conv fwd client example folder
* Add missing include directive
* Update grouped conv instance implementations
* Simplify ckProfiler (grouped conv forward)
* Use GroupedConvFwd to implement client example
* Use greater groupe count in example
* Add custom target to group examples
* Add extra tag param to instance factory function
* Use tag to differentiate factory functions
* Add missing tag argument for factory function
* Remove inheritance relationship
* Remove no-longer used include directive
* Add license in front of file
[ROCm/composable_kernel commit: f49803101e]
This commit is contained in:
@@ -1,2 +0,0 @@
|
||||
add_executable(client_conv2d_fwd conv2d_fwd.cpp)
|
||||
target_link_libraries(client_conv2d_fwd PRIVATE composable_kernel::device_operations)
|
||||
2
client_example/07_grouped_conv2d_fwd/CMakeLists.txt
Normal file
2
client_example/07_grouped_conv2d_fwd/CMakeLists.txt
Normal file
@@ -0,0 +1,2 @@
|
||||
add_executable(client_grouped_conv2d_fwd grouped_conv2d_fwd.cpp)
|
||||
target_link_libraries(client_grouped_conv2d_fwd PRIVATE composable_kernel::device_operations)
|
||||
@@ -1,35 +1,38 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <cstdlib>
|
||||
#include <iomanip>
|
||||
#include <iostream>
|
||||
#include <iterator>
|
||||
#include <numeric>
|
||||
#include <vector>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/convolution_forward.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
using InDataType = ck::half_t;
|
||||
using WeiDataType = ck::half_t;
|
||||
using OutDataType = ck::half_t;
|
||||
|
||||
using InLayout = ck::tensor_layout::convolution::NHWC;
|
||||
using WeiLayout = ck::tensor_layout::convolution::KYXC;
|
||||
using OutLayout = ck::tensor_layout::convolution::NHWK;
|
||||
using InLayout = ck::tensor_layout::convolution::GNHWC;
|
||||
using WeiLayout = ck::tensor_layout::convolution::GKYXC;
|
||||
using OutLayout = ck::tensor_layout::convolution::GNHWK;
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
|
||||
static constexpr ck::index_t NumDimSpatial = 2;
|
||||
static constexpr ck::index_t N = 16;
|
||||
static constexpr ck::index_t K = 32;
|
||||
static constexpr ck::index_t C = 3;
|
||||
static constexpr ck::index_t G = 32;
|
||||
static constexpr ck::index_t N = 256;
|
||||
static constexpr ck::index_t K = 192;
|
||||
static constexpr ck::index_t C = 192;
|
||||
static constexpr ck::index_t Y = 3;
|
||||
static constexpr ck::index_t X = 3;
|
||||
static constexpr ck::index_t Hi = 224;
|
||||
static constexpr ck::index_t Wi = 224;
|
||||
static constexpr ck::index_t Ho = 113;
|
||||
static constexpr ck::index_t Wo = 113;
|
||||
static constexpr ck::index_t Hi = 28;
|
||||
static constexpr ck::index_t Wi = 28;
|
||||
static constexpr ck::index_t Ho = 28;
|
||||
static constexpr ck::index_t Wo = 28;
|
||||
|
||||
struct SimpleDeviceMem
|
||||
{
|
||||
@@ -47,30 +50,66 @@ struct SimpleDeviceMem
|
||||
void* p_mem_;
|
||||
};
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
int main()
|
||||
{
|
||||
std::vector<ck::index_t> in_spatial_lengths{Hi, Wi};
|
||||
std::vector<ck::index_t> filter_spatial_lengths{Y, X};
|
||||
std::vector<ck::index_t> out_spatial_lengths{Ho, Wo};
|
||||
std::vector<ck::index_t> filter_strides{2, 2};
|
||||
std::vector<ck::index_t> filter_dilations{1, 1};
|
||||
std::vector<ck::index_t> input_left_pads{2, 2};
|
||||
std::vector<ck::index_t> input_right_pads{2, 2};
|
||||
std::array<ck::index_t, NumDimSpatial + 3> in_lengths{G, N, Hi, Wi, C};
|
||||
std::array<ck::index_t, NumDimSpatial + 3> in_strides{0, 0, 0, 0, 1};
|
||||
|
||||
SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * C);
|
||||
SimpleDeviceMem wei(sizeof(WeiDataType) * K * Y * X * C);
|
||||
SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * K);
|
||||
std::array<ck::index_t, NumDimSpatial + 3> wei_lengths{G, K, Y, X, C};
|
||||
std::array<ck::index_t, NumDimSpatial + 3> wei_strides{0, 0, 0, 0, 1};
|
||||
|
||||
std::array<ck::index_t, NumDimSpatial + 3> out_lengths{G, N, Ho, Wo, K};
|
||||
std::array<ck::index_t, NumDimSpatial + 3> out_strides{0, 0, 0, 0, 1};
|
||||
|
||||
std::partial_sum(rbegin(in_lengths),
|
||||
std::prev(rend(in_lengths)),
|
||||
std::next(rbegin(in_strides)),
|
||||
std::multiplies<>{});
|
||||
std::partial_sum(rbegin(wei_lengths),
|
||||
std::prev(rend(wei_lengths)),
|
||||
std::next(rbegin(wei_strides)),
|
||||
std::multiplies<>{});
|
||||
std::partial_sum(rbegin(out_lengths),
|
||||
std::prev(rend(out_lengths)),
|
||||
std::next(rbegin(out_strides)),
|
||||
std::multiplies<>{});
|
||||
|
||||
// transpose GNHWC/GKYXC/GNHWK to GNCHW/GKCYX/GNCHW
|
||||
std::rotate(
|
||||
rbegin(in_lengths), std::next(rbegin(in_lengths)), std::next(rbegin(in_lengths), 3));
|
||||
std::rotate(
|
||||
rbegin(in_strides), std::next(rbegin(in_strides)), std::next(rbegin(in_strides), 3));
|
||||
std::rotate(
|
||||
rbegin(wei_lengths), std::next(rbegin(wei_lengths)), std::next(rbegin(wei_lengths), 3));
|
||||
std::rotate(
|
||||
rbegin(wei_strides), std::next(rbegin(wei_strides)), std::next(rbegin(wei_strides), 3));
|
||||
std::rotate(
|
||||
rbegin(out_lengths), std::next(rbegin(out_lengths)), std::next(rbegin(out_lengths), 3));
|
||||
std::rotate(
|
||||
rbegin(out_strides), std::next(rbegin(out_strides)), std::next(rbegin(out_strides), 3));
|
||||
|
||||
std::array<ck::index_t, NumDimSpatial> filter_strides{1, 1};
|
||||
std::array<ck::index_t, NumDimSpatial> filter_dilations{1, 1};
|
||||
std::array<ck::index_t, NumDimSpatial> input_left_pads{1, 1};
|
||||
std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1};
|
||||
|
||||
SimpleDeviceMem in(sizeof(InDataType) * G * N * Hi * Wi * C);
|
||||
SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C);
|
||||
SimpleDeviceMem out(sizeof(OutDataType) * G * N * Ho * Wo * K);
|
||||
|
||||
using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NumDimSpatial,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
ck::Tuple<>,
|
||||
OutLayout,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
ck::Tuple<>,
|
||||
OutDataType,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
PassThrough>;
|
||||
|
||||
using DeviceOp = ck::tensor_operation::device::DeviceConvFwd<NumDimSpatial,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
OutLayout,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
OutDataType,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
PassThrough>;
|
||||
// get device op instances
|
||||
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
|
||||
DeviceOp>::GetInstances();
|
||||
@@ -91,13 +130,16 @@ int main(int argc, char* argv[])
|
||||
auto& op_ptr = op_ptrs[i];
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
|
||||
wei.GetDeviceBuffer(),
|
||||
{},
|
||||
out.GetDeviceBuffer(),
|
||||
N,
|
||||
K,
|
||||
C,
|
||||
in_spatial_lengths,
|
||||
filter_spatial_lengths,
|
||||
out_spatial_lengths,
|
||||
in_lengths,
|
||||
in_strides,
|
||||
wei_lengths,
|
||||
wei_strides,
|
||||
{},
|
||||
{},
|
||||
out_lengths,
|
||||
out_strides,
|
||||
filter_strides,
|
||||
filter_dilations,
|
||||
input_left_pads,
|
||||
@@ -112,10 +154,10 @@ int main(int argc, char* argv[])
|
||||
{
|
||||
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
|
||||
|
||||
std::size_t flop = 2 * N * K * C * Ho * Wo * Y * X;
|
||||
std::size_t num_bytes = sizeof(InDataType) * N * Hi * Wi * C +
|
||||
sizeof(WeiDataType) * K * Y * X * C +
|
||||
sizeof(OutDataType) * N * Ho * Wo * K;
|
||||
std::size_t flop = std::size_t(2) * G * N * K * C * Ho * Wo * Y * X;
|
||||
std::size_t num_bytes = sizeof(InDataType) * G * N * Hi * Wi * C +
|
||||
sizeof(WeiDataType) * G * K * Y * X * C +
|
||||
sizeof(OutDataType) * G * N * Ho * Wo * K;
|
||||
|
||||
float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
|
||||
float gb_per_sec = num_bytes / 1.E6 / avg_time;
|
||||
@@ -134,10 +176,16 @@ int main(int argc, char* argv[])
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cout << op_name << " does not support this problem" << std::endl;
|
||||
std::cerr << op_name << " does not support this problem" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
if(best_op_id < 0)
|
||||
{
|
||||
std::cerr << "no suitable instance" << std::endl;
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
|
||||
std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
|
||||
<< " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
|
||||
|
||||
@@ -148,13 +196,16 @@ int main(int argc, char* argv[])
|
||||
<< std::endl;
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
|
||||
wei.GetDeviceBuffer(),
|
||||
{},
|
||||
out.GetDeviceBuffer(),
|
||||
N,
|
||||
K,
|
||||
C,
|
||||
in_spatial_lengths,
|
||||
filter_spatial_lengths,
|
||||
out_spatial_lengths,
|
||||
in_lengths,
|
||||
in_strides,
|
||||
wei_lengths,
|
||||
wei_strides,
|
||||
{},
|
||||
{},
|
||||
out_lengths,
|
||||
out_strides,
|
||||
filter_strides,
|
||||
filter_dilations,
|
||||
input_left_pads,
|
||||
@@ -172,6 +223,4 @@ int main(int argc, char* argv[])
|
||||
|
||||
std::cout << "Done" << std::endl;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
22
example/30_grouped_conv_fwd_multiple_d/CMakeLists.txt
Normal file
22
example/30_grouped_conv_fwd_multiple_d/CMakeLists.txt
Normal file
@@ -0,0 +1,22 @@
|
||||
add_custom_target(example_grouped_conv_fwd_multiple_d)
|
||||
|
||||
add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_fp16 grouped_conv_fwd_bias_relu_add_xdl_fp16.cpp)
|
||||
add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_fp32 grouped_conv_fwd_bias_relu_add_xdl_fp32.cpp)
|
||||
add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_bf16 grouped_conv_fwd_bias_relu_add_xdl_bf16.cpp)
|
||||
add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_int8 grouped_conv_fwd_bias_relu_add_xdl_int8.cpp)
|
||||
|
||||
add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_fp16)
|
||||
add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_fp32)
|
||||
add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_bf16)
|
||||
add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_int8)
|
||||
|
||||
if(USE_BITINT_EXTENSION_INT4)
|
||||
add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_int4 grouped_conv_fwd_bias_relu_add_xdl_int4.cpp)
|
||||
|
||||
add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_int4)
|
||||
endif() # USE_BITINT_EXTENSION_INT4
|
||||
|
||||
|
||||
add_example_executable(example_grouped_conv_fwd_xdl_fp16 grouped_conv_fwd_xdl_fp16.cpp)
|
||||
|
||||
add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_xdl_fp16)
|
||||
30
example/30_grouped_conv_fwd_multiple_d/README.md
Normal file
30
example/30_grouped_conv_fwd_multiple_d/README.md
Normal file
@@ -0,0 +1,30 @@
|
||||
Command
|
||||
```bash
|
||||
arg1: verification (0=no, 1=yes)
|
||||
arg2: initialization (0=no init, 1=integer value, 2=decimal value)
|
||||
arg3: time kernel (0=no, 1=yes)
|
||||
Following arguments (depending on number of spatial dims):
|
||||
Number of spatial dimensions (1=Conv1d, 2=Conv2d, 3=Conv3d)
|
||||
G, N, K, C,
|
||||
<filter spatial dimensions>, (ie Y, X for 2D)
|
||||
<input image spatial dimensions>, (ie Hi, Wi for 2D)
|
||||
<strides>, (ie Sy, Sx for 2D)
|
||||
<dilations>, (ie Dy, Dx for 2D)
|
||||
<left padding>, (ie LeftPy, LeftPx for 2D)
|
||||
<right padding>, (ie RightPy, RightPx for 2D)
|
||||
|
||||
./bin/example_grouped_conv_fwd_bias_relu_add_xdl_fp16 1 1 1
|
||||
```
|
||||
|
||||
Result (MI100)
|
||||
```
|
||||
in: dim 5, lengths {1, 128, 192, 71, 71}, strides {192, 967872, 1, 13632, 192}
|
||||
wei: dim 5, lengths {1, 256, 192, 3, 3}, strides {442368, 1728, 1, 576, 192}
|
||||
bias: dim 5, lengths {1, 128, 256, 36, 36}, strides {256, 0, 1, 0, 0}
|
||||
residual: dim 5, lengths {1, 128, 256, 36, 36}, strides {256, 0, 1, 0, 0}
|
||||
out: dim 5, lengths {1, 128, 256, 36, 36}, strides {256, 331776, 1, 9216, 256}
|
||||
launch_and_time_kernel: grid_dim {1296, 1, 1}, block_dim {256, 1, 1}
|
||||
Warm up 1 time
|
||||
Start running 10 times...
|
||||
Perf: 1.55981 ms, 94.0927 TFlops, 213.868 GB/s, DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<256, 128, 256, 16, Default>
|
||||
```
|
||||
354
example/30_grouped_conv_fwd_multiple_d/common.hpp
Normal file
354
example/30_grouped_conv_fwd_multiple_d/common.hpp
Normal file
@@ -0,0 +1,354 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <algorithm>
|
||||
#include <array>
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
#include <type_traits>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
#include "ck/library/utility/check_err.hpp"
|
||||
#include "ck/library/utility/device_memory.hpp"
|
||||
#include "ck/library/utility/host_tensor.hpp"
|
||||
#include "ck/library/utility/host_tensor_generator.hpp"
|
||||
#include "ck/library/utility/convolution_parameter.hpp"
|
||||
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
|
||||
|
||||
using BF16 = ck::bhalf_t;
|
||||
using FP16 = ck::half_t;
|
||||
using FP32 = float;
|
||||
#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
|
||||
using I4 = ck::int4_t;
|
||||
#endif
|
||||
using I8 = std::int8_t;
|
||||
using I32 = std::int32_t;
|
||||
|
||||
template <ck::index_t... Is>
|
||||
using S = ck::Sequence<Is...>;
|
||||
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
|
||||
static constexpr auto ConvSpec =
|
||||
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
|
||||
|
||||
static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
|
||||
|
||||
template <typename InputLay, typename WeightLay, typename OutputLay>
|
||||
struct CommonLayoutSetting
|
||||
{
|
||||
using InputLayout = InputLay;
|
||||
using WeightLayout = WeightLay;
|
||||
using OutputLayout = OutputLay;
|
||||
};
|
||||
|
||||
template <ck::index_t NDimSpatial>
|
||||
struct CommonLayoutSettingSelector;
|
||||
|
||||
namespace ctl = ck::tensor_layout::convolution;
|
||||
|
||||
template <>
|
||||
struct CommonLayoutSettingSelector<1> final
|
||||
: CommonLayoutSetting<ctl::G_NW_C, ctl::G_K_X_C, ctl::G_NW_K>
|
||||
{
|
||||
};
|
||||
|
||||
template <>
|
||||
struct CommonLayoutSettingSelector<2> final
|
||||
: CommonLayoutSetting<ctl::G_NHW_C, ctl::G_K_YX_C, ctl::G_NHW_K>
|
||||
{
|
||||
};
|
||||
|
||||
template <>
|
||||
struct CommonLayoutSettingSelector<3> final
|
||||
: CommonLayoutSetting<ctl::G_NDHW_C, ctl::G_K_ZYX_C, ctl::G_NDHW_K>
|
||||
{
|
||||
};
|
||||
|
||||
template <ck::index_t NDimSpatial>
|
||||
using InputLayout = typename CommonLayoutSettingSelector<NDimSpatial>::InputLayout;
|
||||
|
||||
template <ck::index_t NDimSpatial>
|
||||
using WeightLayout = typename CommonLayoutSettingSelector<NDimSpatial>::WeightLayout;
|
||||
|
||||
template <ck::index_t NDimSpatial>
|
||||
using OutputLayout = typename CommonLayoutSettingSelector<NDimSpatial>::OutputLayout;
|
||||
|
||||
struct ExecutionConfig final
|
||||
{
|
||||
bool do_verification = true;
|
||||
int init_method = 1;
|
||||
bool time_kernel = true;
|
||||
};
|
||||
|
||||
#define DefaultConvParam \
|
||||
ck::utils::conv::ConvParam \
|
||||
{ \
|
||||
2, 32, 2, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, { 1, 1 } \
|
||||
}
|
||||
|
||||
inline void print_help_msg()
|
||||
{
|
||||
std::cerr << "arg1: verification (0=no, 1=yes)\n"
|
||||
<< "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
|
||||
<< "arg3: time kernel (0=no, 1=yes)\n"
|
||||
<< ck::utils::conv::get_conv_param_parser_helper_msg() << std::endl;
|
||||
}
|
||||
|
||||
inline bool parse_cmd_args(int argc,
|
||||
char* argv[],
|
||||
ExecutionConfig& config,
|
||||
ck::utils::conv::ConvParam& conv_param)
|
||||
{
|
||||
constexpr int num_execution_config_args =
|
||||
3; // arguments for do_verification, init_method, time_kernel
|
||||
constexpr int num_conv_param_leading_args = 5; // arguments for num_dim_spatial_, G_, N_, K_, C_
|
||||
|
||||
constexpr int threshold_to_catch_partial_args = 1 + num_execution_config_args;
|
||||
constexpr int threshold_to_catch_all_args =
|
||||
threshold_to_catch_partial_args + num_conv_param_leading_args;
|
||||
|
||||
if(argc == 1)
|
||||
{
|
||||
// use default
|
||||
}
|
||||
// catch only ExecutionConfig arguments
|
||||
else if(argc == threshold_to_catch_partial_args)
|
||||
{
|
||||
config.do_verification = std::stoi(argv[1]);
|
||||
config.init_method = std::stoi(argv[2]);
|
||||
config.time_kernel = std::stoi(argv[3]);
|
||||
}
|
||||
// catch both ExecutionConfig & ConvParam arguments
|
||||
else if(threshold_to_catch_all_args < argc && ((argc - threshold_to_catch_all_args) % 3 == 0))
|
||||
{
|
||||
config.do_verification = std::stoi(argv[1]);
|
||||
config.init_method = std::stoi(argv[2]);
|
||||
config.time_kernel = std::stoi(argv[3]);
|
||||
|
||||
const ck::index_t num_dim_spatial = std::stoi(argv[4]);
|
||||
conv_param = ck::utils::conv::parse_conv_param(
|
||||
num_dim_spatial, threshold_to_catch_partial_args, argv);
|
||||
}
|
||||
else
|
||||
{
|
||||
print_help_msg();
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
inline HostTensorDescriptor make_input_descriptor(const ck::utils::conv::ConvParam& conv_param)
|
||||
{
|
||||
switch(conv_param.num_dim_spatial_)
|
||||
{
|
||||
case 1:
|
||||
return HostTensorDescriptor(
|
||||
{conv_param.G_, conv_param.N_, conv_param.C_, conv_param.input_spatial_lengths_[0]},
|
||||
{
|
||||
conv_param.C_, // g
|
||||
conv_param.input_spatial_lengths_[0] * conv_param.G_ * conv_param.C_, // n
|
||||
1, // c
|
||||
conv_param.G_ * conv_param.C_ // wi
|
||||
});
|
||||
|
||||
case 2:
|
||||
return HostTensorDescriptor(
|
||||
{conv_param.G_,
|
||||
conv_param.N_,
|
||||
conv_param.C_,
|
||||
conv_param.input_spatial_lengths_[0],
|
||||
conv_param.input_spatial_lengths_[1]},
|
||||
{
|
||||
conv_param.C_, // g
|
||||
conv_param.input_spatial_lengths_[0] * conv_param.input_spatial_lengths_[1] *
|
||||
conv_param.G_ * conv_param.C_, // n
|
||||
1, // c
|
||||
conv_param.input_spatial_lengths_[1] * conv_param.G_ * conv_param.C_, // hi
|
||||
conv_param.G_ * conv_param.C_ // wi
|
||||
});
|
||||
|
||||
case 3:
|
||||
return HostTensorDescriptor(
|
||||
{conv_param.G_,
|
||||
conv_param.N_,
|
||||
conv_param.C_,
|
||||
conv_param.input_spatial_lengths_[0],
|
||||
conv_param.input_spatial_lengths_[1],
|
||||
conv_param.input_spatial_lengths_[2]},
|
||||
{
|
||||
conv_param.C_, // g
|
||||
conv_param.input_spatial_lengths_[0] * conv_param.input_spatial_lengths_[1] *
|
||||
conv_param.input_spatial_lengths_[2] * conv_param.G_ * conv_param.C_, // n
|
||||
1, // c
|
||||
conv_param.input_spatial_lengths_[1] * conv_param.input_spatial_lengths_[2] *
|
||||
conv_param.G_ * conv_param.C_, // di
|
||||
conv_param.input_spatial_lengths_[2] * conv_param.G_ * conv_param.C_, // hi
|
||||
conv_param.G_ * conv_param.C_ // wi
|
||||
});
|
||||
}
|
||||
|
||||
throw std::runtime_error("unsuppored # dim spatial");
|
||||
}
|
||||
|
||||
inline HostTensorDescriptor make_weight_descriptor(const ck::utils::conv::ConvParam& conv_param)
|
||||
{
|
||||
switch(conv_param.num_dim_spatial_)
|
||||
{
|
||||
case 1:
|
||||
return HostTensorDescriptor(
|
||||
{conv_param.G_, conv_param.K_, conv_param.C_, conv_param.filter_spatial_lengths_[0]},
|
||||
{
|
||||
conv_param.K_ * conv_param.filter_spatial_lengths_[0] * conv_param.C_, // g
|
||||
conv_param.filter_spatial_lengths_[0] * conv_param.C_, // k
|
||||
1, // c
|
||||
conv_param.C_ // x
|
||||
});
|
||||
case 2:
|
||||
return HostTensorDescriptor(
|
||||
{conv_param.G_,
|
||||
conv_param.K_,
|
||||
conv_param.C_,
|
||||
conv_param.filter_spatial_lengths_[0],
|
||||
conv_param.filter_spatial_lengths_[1]},
|
||||
{
|
||||
conv_param.K_ * conv_param.filter_spatial_lengths_[0] *
|
||||
conv_param.filter_spatial_lengths_[1] * conv_param.C_, // g
|
||||
conv_param.filter_spatial_lengths_[0] * conv_param.filter_spatial_lengths_[1] *
|
||||
conv_param.C_, // k
|
||||
1, // c
|
||||
conv_param.filter_spatial_lengths_[1] * conv_param.C_, // y
|
||||
conv_param.C_ // x
|
||||
});
|
||||
case 3:
|
||||
return HostTensorDescriptor(
|
||||
{conv_param.G_,
|
||||
conv_param.K_,
|
||||
conv_param.C_,
|
||||
conv_param.filter_spatial_lengths_[0],
|
||||
conv_param.filter_spatial_lengths_[1],
|
||||
conv_param.filter_spatial_lengths_[2]},
|
||||
{
|
||||
conv_param.K_ * conv_param.filter_spatial_lengths_[0] *
|
||||
conv_param.filter_spatial_lengths_[1] * conv_param.filter_spatial_lengths_[2] *
|
||||
conv_param.C_, // g
|
||||
conv_param.filter_spatial_lengths_[0] * conv_param.filter_spatial_lengths_[1] *
|
||||
conv_param.filter_spatial_lengths_[2] * conv_param.C_, // k
|
||||
1, // c
|
||||
conv_param.filter_spatial_lengths_[1] * conv_param.filter_spatial_lengths_[2] *
|
||||
conv_param.C_, // z
|
||||
conv_param.filter_spatial_lengths_[2] * conv_param.C_, // y
|
||||
conv_param.C_ // x
|
||||
});
|
||||
}
|
||||
|
||||
throw std::runtime_error("unsuppored # dim spatial");
|
||||
}
|
||||
|
||||
inline HostTensorDescriptor make_bias_descriptor(const ck::utils::conv::ConvParam& conv_param)
|
||||
{
|
||||
switch(conv_param.num_dim_spatial_)
|
||||
{
|
||||
case 1:
|
||||
return HostTensorDescriptor(
|
||||
{conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
|
||||
{
|
||||
conv_param.K_, // g
|
||||
0, // k
|
||||
1, // c
|
||||
0 // x
|
||||
});
|
||||
case 2:
|
||||
return HostTensorDescriptor({conv_param.G_,
|
||||
conv_param.N_,
|
||||
conv_param.K_,
|
||||
conv_param.output_spatial_lengths_[0],
|
||||
conv_param.output_spatial_lengths_[1]},
|
||||
{
|
||||
conv_param.K_, // g
|
||||
0, // n
|
||||
1, // k
|
||||
0, // ho
|
||||
0 // wo
|
||||
});
|
||||
case 3:
|
||||
return HostTensorDescriptor({conv_param.G_,
|
||||
conv_param.N_,
|
||||
conv_param.K_,
|
||||
conv_param.output_spatial_lengths_[0],
|
||||
conv_param.output_spatial_lengths_[1],
|
||||
conv_param.output_spatial_lengths_[2]},
|
||||
{
|
||||
conv_param.K_, // g
|
||||
0, // n
|
||||
1, // k
|
||||
0, // z
|
||||
0, // y
|
||||
0 // x
|
||||
});
|
||||
}
|
||||
|
||||
throw std::runtime_error("unsuppored # dim spatial");
|
||||
}
|
||||
|
||||
inline HostTensorDescriptor make_output_descriptor(const ck::utils::conv::ConvParam& conv_param)
|
||||
{
|
||||
|
||||
switch(conv_param.num_dim_spatial_)
|
||||
{
|
||||
case 1:
|
||||
return HostTensorDescriptor(
|
||||
{conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
|
||||
{
|
||||
conv_param.K_, // g
|
||||
conv_param.output_spatial_lengths_[0] * conv_param.G_ * conv_param.K_, // n
|
||||
1, // k
|
||||
conv_param.G_ * conv_param.K_ // wo
|
||||
});
|
||||
case 2:
|
||||
return HostTensorDescriptor(
|
||||
{conv_param.G_,
|
||||
conv_param.N_,
|
||||
conv_param.K_,
|
||||
conv_param.output_spatial_lengths_[0],
|
||||
conv_param.output_spatial_lengths_[1]},
|
||||
{
|
||||
conv_param.K_, // g
|
||||
conv_param.output_spatial_lengths_[0] * conv_param.output_spatial_lengths_[1] *
|
||||
conv_param.G_ * conv_param.K_, // n
|
||||
1, // k
|
||||
conv_param.output_spatial_lengths_[1] * conv_param.G_ * conv_param.K_, // ho
|
||||
conv_param.G_ * conv_param.K_ // wo
|
||||
});
|
||||
|
||||
case 3:
|
||||
return HostTensorDescriptor(
|
||||
{conv_param.G_,
|
||||
conv_param.N_,
|
||||
conv_param.K_,
|
||||
conv_param.output_spatial_lengths_[0],
|
||||
conv_param.output_spatial_lengths_[1],
|
||||
conv_param.output_spatial_lengths_[2]},
|
||||
{
|
||||
conv_param.K_, // g
|
||||
conv_param.output_spatial_lengths_[0] * conv_param.output_spatial_lengths_[1] *
|
||||
conv_param.output_spatial_lengths_[2] * conv_param.G_ * conv_param.K_, // n
|
||||
1, // k
|
||||
conv_param.output_spatial_lengths_[1] * conv_param.output_spatial_lengths_[2] *
|
||||
conv_param.G_ * conv_param.K_, // do
|
||||
conv_param.output_spatial_lengths_[2] * conv_param.G_ * conv_param.K_, // ho
|
||||
conv_param.G_ * conv_param.K_ // wo
|
||||
});
|
||||
}
|
||||
|
||||
throw std::runtime_error("unsuppored # dim spatial");
|
||||
}
|
||||
@@ -0,0 +1,26 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
// kernel data types
|
||||
using InKernelDataType = BF16;
|
||||
using WeiKernelDataType = BF16;
|
||||
using AccDataType = FP32;
|
||||
using CShuffleDataType = FP32;
|
||||
using BiasKernelDataType = BF16;
|
||||
using ResidualKernelDataType = BF16;
|
||||
using OutKernelDataType = BF16;
|
||||
|
||||
// tensor data types
|
||||
using InUserDataType = InKernelDataType;
|
||||
using WeiUserDataType = WeiKernelDataType;
|
||||
using OutUserDataType = OutKernelDataType;
|
||||
|
||||
using InElementOp = PassThrough;
|
||||
using WeiElementOp = PassThrough;
|
||||
using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd;
|
||||
|
||||
#include "run_grouped_conv_fwd_bias_relu_add_example.inc"
|
||||
|
||||
int main(int argc, char* argv[]) { return !run_grouped_conv_fwd_bias_relu_add_example(argc, argv); }
|
||||
@@ -0,0 +1,26 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
// kernel data types
|
||||
using InKernelDataType = FP16;
|
||||
using WeiKernelDataType = FP16;
|
||||
using AccDataType = FP32;
|
||||
using CShuffleDataType = FP16;
|
||||
using BiasKernelDataType = FP16;
|
||||
using ResidualKernelDataType = FP16;
|
||||
using OutKernelDataType = FP16;
|
||||
|
||||
// tensor data types
|
||||
using InUserDataType = InKernelDataType;
|
||||
using WeiUserDataType = WeiKernelDataType;
|
||||
using OutUserDataType = OutKernelDataType;
|
||||
|
||||
using InElementOp = PassThrough;
|
||||
using WeiElementOp = PassThrough;
|
||||
using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd;
|
||||
|
||||
#include "run_grouped_conv_fwd_bias_relu_add_example.inc"
|
||||
|
||||
int main(int argc, char* argv[]) { return !run_grouped_conv_fwd_bias_relu_add_example(argc, argv); }
|
||||
@@ -0,0 +1,26 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
// kernel data types
|
||||
using InKernelDataType = FP32;
|
||||
using WeiKernelDataType = FP32;
|
||||
using AccDataType = FP32;
|
||||
using CShuffleDataType = FP32;
|
||||
using BiasKernelDataType = FP32;
|
||||
using ResidualKernelDataType = FP32;
|
||||
using OutKernelDataType = FP32;
|
||||
|
||||
// tensor data types
|
||||
using InUserDataType = InKernelDataType;
|
||||
using WeiUserDataType = WeiKernelDataType;
|
||||
using OutUserDataType = OutKernelDataType;
|
||||
|
||||
using InElementOp = PassThrough;
|
||||
using WeiElementOp = PassThrough;
|
||||
using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd;
|
||||
|
||||
#include "run_grouped_conv_fwd_bias_relu_add_example.inc"
|
||||
|
||||
int main(int argc, char* argv[]) { return !run_grouped_conv_fwd_bias_relu_add_example(argc, argv); }
|
||||
@@ -0,0 +1,31 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#ifndef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
|
||||
#error Should compile this file with ck::int4_t support
|
||||
#endif
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
// kernel data types
|
||||
using InKernelDataType = I8;
|
||||
using WeiKernelDataType = I8;
|
||||
using AccDataType = I32;
|
||||
using CShuffleDataType = I8;
|
||||
using BiasKernelDataType = I8;
|
||||
using ResidualKernelDataType = I8;
|
||||
using OutKernelDataType = I8;
|
||||
|
||||
// tensor data types
|
||||
using InUserDataType = I4;
|
||||
using WeiUserDataType = I4;
|
||||
using OutUserDataType = I4;
|
||||
|
||||
using InElementOp = PassThrough;
|
||||
using WeiElementOp = PassThrough;
|
||||
using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd;
|
||||
|
||||
#define BUILD_INT4_EXAMPLE
|
||||
#include "run_grouped_conv_fwd_bias_relu_add_example.inc"
|
||||
|
||||
int main(int argc, char* argv[]) { return !run_grouped_conv_fwd_bias_relu_add_example(argc, argv); }
|
||||
@@ -0,0 +1,26 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
// kernel data types
|
||||
using InKernelDataType = I8;
|
||||
using WeiKernelDataType = I8;
|
||||
using AccDataType = I32;
|
||||
using CShuffleDataType = I8;
|
||||
using BiasKernelDataType = I8;
|
||||
using ResidualKernelDataType = I8;
|
||||
using OutKernelDataType = I8;
|
||||
|
||||
// tensor data types
|
||||
using InUserDataType = InKernelDataType;
|
||||
using WeiUserDataType = WeiKernelDataType;
|
||||
using OutUserDataType = OutKernelDataType;
|
||||
|
||||
using InElementOp = PassThrough;
|
||||
using WeiElementOp = PassThrough;
|
||||
using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd;
|
||||
|
||||
#include "run_grouped_conv_fwd_bias_relu_add_example.inc"
|
||||
|
||||
int main(int argc, char* argv[]) { return !run_grouped_conv_fwd_bias_relu_add_example(argc, argv); }
|
||||
@@ -0,0 +1,24 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
// kernel data types
|
||||
using InKernelDataType = FP16;
|
||||
using WeiKernelDataType = FP16;
|
||||
using AccDataType = FP32;
|
||||
using CShuffleDataType = FP16;
|
||||
using OutKernelDataType = FP16;
|
||||
|
||||
// tensor data types
|
||||
using InUserDataType = InKernelDataType;
|
||||
using WeiUserDataType = WeiKernelDataType;
|
||||
using OutUserDataType = OutKernelDataType;
|
||||
|
||||
using InElementOp = PassThrough;
|
||||
using WeiElementOp = PassThrough;
|
||||
using OutElementOp = PassThrough;
|
||||
|
||||
#include "run_grouped_conv_fwd_example.inc"
|
||||
|
||||
int main(int argc, char* argv[]) { return !run_grouped_conv_fwd_example(argc, argv); }
|
||||
@@ -1,59 +1,110 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <cstdlib>
|
||||
#include <iostream>
|
||||
#include <numeric>
|
||||
#include <type_traits>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
#include "ck/library/utility/check_err.hpp"
|
||||
#include "ck/library/utility/device_memory.hpp"
|
||||
#include "ck/library/utility/host_tensor.hpp"
|
||||
#include "ck/library/utility/host_tensor_generator.hpp"
|
||||
#include "ck/library/utility/convolution_parameter.hpp"
|
||||
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
|
||||
|
||||
void print_helper_msg()
|
||||
template <typename BiasLay, typename ResidualLay>
|
||||
struct LayoutSetting
|
||||
{
|
||||
std::cout << "arg1: verification (0=no, 1=yes)\n"
|
||||
<< "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
|
||||
<< "arg3: time kernel (0=no, 1=yes)\n"
|
||||
<< ck::utils::conv::get_conv_param_parser_helper_msg() << std::endl;
|
||||
}
|
||||
using BiasLayout = BiasLay;
|
||||
using ResidualLayout = ResidualLay;
|
||||
};
|
||||
|
||||
template <ck::index_t NDimSpatial,
|
||||
typename InKernelDataType,
|
||||
typename WeiKernelDataType,
|
||||
typename CShuffleDataType,
|
||||
typename OutKernelDataType,
|
||||
typename InElementOp,
|
||||
typename WeiElementOp,
|
||||
typename OutElementOp,
|
||||
typename InUserDataType,
|
||||
typename WeiUserDataType,
|
||||
typename OutUserDataType,
|
||||
typename DeviceConvNDFwdInstance>
|
||||
int run_grouped_conv_fwd_bias_relu_add(bool do_verification,
|
||||
int init_method,
|
||||
bool time_kernel,
|
||||
const ck::utils::conv::ConvParam& conv_param,
|
||||
const HostTensorDescriptor& in_g_n_c_wis_desc,
|
||||
const HostTensorDescriptor& wei_g_k_c_xs_desc,
|
||||
const HostTensorDescriptor& bias_g_n_k_wos_desc,
|
||||
const HostTensorDescriptor& residual_g_n_k_wos_desc,
|
||||
const HostTensorDescriptor& out_g_n_k_wos_desc,
|
||||
const InElementOp& in_element_op,
|
||||
const WeiElementOp& wei_element_op,
|
||||
const OutElementOp& out_element_op)
|
||||
template <ck::index_t NDimSpatial>
|
||||
struct LayoutSettingSelector;
|
||||
|
||||
template <>
|
||||
struct LayoutSettingSelector<1> final : LayoutSetting<ctl::G_K, ctl::G_NW_K>
|
||||
{
|
||||
};
|
||||
|
||||
template <>
|
||||
struct LayoutSettingSelector<2> final : LayoutSetting<ctl::G_K, ctl::G_NHW_K>
|
||||
{
|
||||
};
|
||||
|
||||
template <>
|
||||
struct LayoutSettingSelector<3> final : LayoutSetting<ctl::G_K, ctl::G_NDHW_K>
|
||||
{
|
||||
};
|
||||
|
||||
template <ck::index_t NDimSpatial>
|
||||
using BiasLayout = typename LayoutSettingSelector<NDimSpatial>::BiasLayout;
|
||||
|
||||
template <ck::index_t NDimSpatial>
|
||||
using ResidualLayout = typename LayoutSettingSelector<NDimSpatial>::ResidualLayout;
|
||||
|
||||
template <ck::index_t NDimSpatial>
|
||||
using DeviceConvFwdInstance =
|
||||
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<
|
||||
NDimSpatial,
|
||||
InputLayout<NDimSpatial>,
|
||||
WeightLayout<NDimSpatial>,
|
||||
ck::Tuple<BiasLayout<NDimSpatial>, ResidualLayout<NDimSpatial>>,
|
||||
OutputLayout<NDimSpatial>,
|
||||
InKernelDataType,
|
||||
WeiKernelDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
ck::Tuple<BiasKernelDataType, ResidualKernelDataType>,
|
||||
OutKernelDataType,
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
OutElementOp,
|
||||
ConvSpec, // ConvForwardSpecialization
|
||||
GemmSpec, // GemmSpecialization
|
||||
1, //
|
||||
256, // BlockSize
|
||||
128, // MPerBlock
|
||||
256, // NPerBlock
|
||||
16, // KPerBlock
|
||||
4, // AK1
|
||||
4, // BK1
|
||||
32, // MPerXdl
|
||||
32, // NPerXdl
|
||||
2, // MXdlPerWave
|
||||
4, // NXdlPerWave
|
||||
S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
|
||||
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
|
||||
S<1, 0, 2>, // ABlockTransferSrcAccessOrder
|
||||
2, // ABlockTransferSrcVectorDim
|
||||
4, // ABlockTransferSrcScalarPerVector
|
||||
4, // ABlockTransferDstScalarPerVector_AK1
|
||||
1, // ABlockLdsExtraM
|
||||
S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
|
||||
S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder
|
||||
S<1, 0, 2>, // BBlockTransferSrcAccessOrder
|
||||
2, // BBlockTransferSrcVectorDim
|
||||
4, // BBlockTransferSrcScalarPerVector
|
||||
4, // BBlockTransferDstScalarPerVector_BK1
|
||||
1, // BBlockLdsExtraN
|
||||
1,
|
||||
1,
|
||||
S<1, 16, 1, 16>,
|
||||
4>;
|
||||
|
||||
template <ck::index_t NDimSpatial>
|
||||
using HostConvFwdInstance = ck::tensor_operation::host::ReferenceConvFwd<NDimSpatial,
|
||||
InUserDataType,
|
||||
WeiUserDataType,
|
||||
CShuffleDataType,
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
PassThrough>;
|
||||
|
||||
template <ck::index_t NDimSpatial>
|
||||
bool run_grouped_conv_fwd_bias_relu_add(const ExecutionConfig& config,
|
||||
const ck::utils::conv::ConvParam& conv_param)
|
||||
{
|
||||
static_assert(1 <= NDimSpatial && NDimSpatial <= 3, "Unsupported NDimSpatial");
|
||||
|
||||
const auto in_g_n_c_wis_desc = make_input_descriptor(conv_param);
|
||||
const auto wei_g_k_c_xs_desc = make_weight_descriptor(conv_param);
|
||||
const auto bias_g_n_k_wos_desc = make_bias_descriptor(conv_param);
|
||||
const auto out_g_n_k_wos_desc = make_output_descriptor(conv_param);
|
||||
|
||||
Tensor<InUserDataType> in(in_g_n_c_wis_desc);
|
||||
Tensor<WeiUserDataType> wei(wei_g_k_c_xs_desc);
|
||||
Tensor<OutUserDataType> bias(bias_g_n_k_wos_desc);
|
||||
Tensor<OutUserDataType> residual(residual_g_n_k_wos_desc);
|
||||
Tensor<OutUserDataType> residual(bias_g_n_k_wos_desc);
|
||||
Tensor<OutUserDataType> out_host(out_g_n_k_wos_desc);
|
||||
Tensor<OutKernelDataType> out_device(out_g_n_k_wos_desc);
|
||||
|
||||
@@ -63,7 +114,7 @@ int run_grouped_conv_fwd_bias_relu_add(bool do_verification,
|
||||
std::cout << "residual: " << residual.mDesc << std::endl;
|
||||
std::cout << "out: " << out_host.mDesc << std::endl;
|
||||
|
||||
switch(init_method)
|
||||
switch(config.init_method)
|
||||
{
|
||||
case 0: break;
|
||||
case 1:
|
||||
@@ -83,7 +134,7 @@ int run_grouped_conv_fwd_bias_relu_add(bool do_verification,
|
||||
DeviceMem residual_device_buf(sizeof(OutKernelDataType) * residual.mDesc.GetElementSpaceSize());
|
||||
DeviceMem out_device_buf(sizeof(OutKernelDataType) * out_device.mDesc.GetElementSpaceSize());
|
||||
|
||||
#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
|
||||
#ifdef BUILD_INT4_EXAMPLE
|
||||
const Tensor<InKernelDataType> in_converted(in);
|
||||
const Tensor<WeiKernelDataType> wei_converted(wei);
|
||||
const Tensor<OutKernelDataType> bias_converted(bias);
|
||||
@@ -93,12 +144,12 @@ int run_grouped_conv_fwd_bias_relu_add(bool do_verification,
|
||||
wei_device_buf.ToDevice(wei_converted.mData.data());
|
||||
bias_device_buf.ToDevice(bias_converted.mData.data());
|
||||
residual_device_buf.ToDevice(residual_converted.mData.data());
|
||||
#else // CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
|
||||
#else
|
||||
in_device_buf.ToDevice(in.mData.data());
|
||||
wei_device_buf.ToDevice(wei.mData.data());
|
||||
bias_device_buf.ToDevice(bias.mData.data());
|
||||
residual_device_buf.ToDevice(residual.mData.data());
|
||||
#endif // CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
|
||||
#endif
|
||||
|
||||
std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_lengths{};
|
||||
std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_strides{};
|
||||
@@ -123,8 +174,8 @@ int run_grouped_conv_fwd_bias_relu_add(bool do_verification,
|
||||
copy(wei_g_k_c_xs_desc.GetStrides(), b_g_k_c_xs_strides);
|
||||
copy(bias_g_n_k_wos_desc.GetLengths(), d0_g_n_k_wos_lengths);
|
||||
copy(bias_g_n_k_wos_desc.GetStrides(), d0_g_n_k_wos_strides);
|
||||
copy(residual_g_n_k_wos_desc.GetLengths(), d1_g_n_k_wos_lengths);
|
||||
copy(residual_g_n_k_wos_desc.GetStrides(), d1_g_n_k_wos_strides);
|
||||
copy(bias_g_n_k_wos_desc.GetLengths(), d1_g_n_k_wos_lengths);
|
||||
copy(bias_g_n_k_wos_desc.GetStrides(), d1_g_n_k_wos_strides);
|
||||
copy(out_g_n_k_wos_desc.GetLengths(), e_g_n_k_wos_lengths);
|
||||
copy(out_g_n_k_wos_desc.GetStrides(), e_g_n_k_wos_strides);
|
||||
copy(conv_param.conv_filter_strides_, conv_filter_strides);
|
||||
@@ -133,7 +184,7 @@ int run_grouped_conv_fwd_bias_relu_add(bool do_verification,
|
||||
copy(conv_param.input_right_pads_, input_right_pads);
|
||||
|
||||
// do Conv
|
||||
auto conv = DeviceConvNDFwdInstance{};
|
||||
auto conv = DeviceConvFwdInstance<NDimSpatial>{};
|
||||
auto invoker = conv.MakeInvoker();
|
||||
auto argument =
|
||||
conv.MakeArgument(in_device_buf.GetDeviceBuffer(),
|
||||
@@ -155,9 +206,9 @@ int run_grouped_conv_fwd_bias_relu_add(bool do_verification,
|
||||
conv_filter_dilations,
|
||||
input_left_pads,
|
||||
input_right_pads,
|
||||
in_element_op,
|
||||
wei_element_op,
|
||||
out_element_op);
|
||||
InElementOp{},
|
||||
WeiElementOp{},
|
||||
OutElementOp{});
|
||||
|
||||
if(!conv.IsSupportedArgument(argument))
|
||||
{
|
||||
@@ -166,7 +217,7 @@ int run_grouped_conv_fwd_bias_relu_add(bool do_verification,
|
||||
"not support this Conv problem");
|
||||
}
|
||||
|
||||
float avg_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
|
||||
float avg_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});
|
||||
|
||||
std::size_t flop = conv_param.GetFlops();
|
||||
std::size_t num_btype = conv_param.GetByte<InUserDataType, WeiUserDataType, OutUserDataType>();
|
||||
@@ -176,20 +227,11 @@ int run_grouped_conv_fwd_bias_relu_add(bool do_verification,
|
||||
std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
|
||||
<< conv.GetTypeString() << std::endl;
|
||||
|
||||
if(do_verification)
|
||||
if(config.do_verification)
|
||||
{
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
|
||||
Tensor<CShuffleDataType> c_host(out_g_n_k_wos_desc);
|
||||
|
||||
auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd<NDimSpatial,
|
||||
InUserDataType,
|
||||
WeiUserDataType,
|
||||
CShuffleDataType,
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
PassThrough>();
|
||||
|
||||
auto ref_conv = HostConvFwdInstance<NDimSpatial>{};
|
||||
auto ref_invoker = ref_conv.MakeInvoker();
|
||||
auto ref_argument = ref_conv.MakeArgument(in,
|
||||
wei,
|
||||
@@ -198,36 +240,49 @@ int run_grouped_conv_fwd_bias_relu_add(bool do_verification,
|
||||
conv_param.conv_filter_dilations_,
|
||||
conv_param.input_left_pads_,
|
||||
conv_param.input_right_pads_,
|
||||
in_element_op,
|
||||
wei_element_op,
|
||||
InElementOp{},
|
||||
WeiElementOp{},
|
||||
PassThrough{});
|
||||
|
||||
ref_invoker.Run(ref_argument);
|
||||
|
||||
// TODO: implement elementwise operation for host
|
||||
out_host.ForEach([&](auto&, auto idx) {
|
||||
out_element_op(out_host(idx), c_host(idx), bias(idx), residual(idx));
|
||||
OutElementOp{}(out_host(idx), c_host(idx), bias(idx), residual(idx));
|
||||
});
|
||||
|
||||
out_device_buf.FromDevice(out_device.mData.data());
|
||||
|
||||
#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
|
||||
#ifdef BUILD_INT4_EXAMPLE
|
||||
const Tensor<OutUserDataType> out_device_converted(out_device);
|
||||
|
||||
return ck::utils::check_err(out_device_converted.mData,
|
||||
out_host.mData,
|
||||
"Error: incorrect results!",
|
||||
1e-5f,
|
||||
1e-4f)
|
||||
? 0
|
||||
: 1;
|
||||
#else // CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
|
||||
return ck::utils::check_err(
|
||||
out_device.mData, out_host.mData, "Error: incorrect results!", 1e-5f, 1e-4f)
|
||||
? 0
|
||||
: 1;
|
||||
#endif // CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
|
||||
out_device_converted.mData, out_host.mData, "Error: incorrect results!", 1e-5f, 1e-4f);
|
||||
#else
|
||||
return ck::utils::check_err(
|
||||
out_device.mData, out_host.mData, "Error: incorrect results!", 1e-5f, 1e-4f);
|
||||
#endif
|
||||
}
|
||||
|
||||
return 0;
|
||||
return true;
|
||||
}
|
||||
|
||||
bool run_grouped_conv_fwd_bias_relu_add_example(int argc, char* argv[])
|
||||
{
|
||||
ExecutionConfig config;
|
||||
ck::utils::conv::ConvParam conv_param = DefaultConvParam;
|
||||
|
||||
if(!parse_cmd_args(argc, argv, config, conv_param))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
switch(conv_param.num_dim_spatial_)
|
||||
{
|
||||
case 1: return run_grouped_conv_fwd_bias_relu_add<1>(config, conv_param);
|
||||
case 2: return run_grouped_conv_fwd_bias_relu_add<2>(config, conv_param);
|
||||
case 3: return run_grouped_conv_fwd_bias_relu_add<3>(config, conv_param);
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
@@ -0,0 +1,223 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
template <ck::index_t NDimSpatial>
|
||||
using DeviceConvFwdInstance =
|
||||
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<
|
||||
NDimSpatial,
|
||||
InputLayout<NDimSpatial>,
|
||||
WeightLayout<NDimSpatial>,
|
||||
ck::Tuple<>,
|
||||
OutputLayout<NDimSpatial>,
|
||||
InKernelDataType,
|
||||
WeiKernelDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
ck::Tuple<>,
|
||||
OutKernelDataType,
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
OutElementOp,
|
||||
ConvSpec, // ConvForwardSpecialization
|
||||
GemmSpec, // GemmSpecialization
|
||||
1, //
|
||||
256, // BlockSize
|
||||
128, // MPerBlock
|
||||
256, // NPerBlock
|
||||
16, // KPerBlock
|
||||
4, // AK1
|
||||
4, // BK1
|
||||
32, // MPerXdl
|
||||
32, // NPerXdl
|
||||
2, // MXdlPerWave
|
||||
4, // NXdlPerWave
|
||||
S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
|
||||
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
|
||||
S<1, 0, 2>, // ABlockTransferSrcAccessOrder
|
||||
2, // ABlockTransferSrcVectorDim
|
||||
4, // ABlockTransferSrcScalarPerVector
|
||||
4, // ABlockTransferDstScalarPerVector_AK1
|
||||
1, // ABlockLdsExtraM
|
||||
S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
|
||||
S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder
|
||||
S<1, 0, 2>, // BBlockTransferSrcAccessOrder
|
||||
2, // BBlockTransferSrcVectorDim
|
||||
4, // BBlockTransferSrcScalarPerVector
|
||||
4, // BBlockTransferDstScalarPerVector_BK1
|
||||
1, // BBlockLdsExtraN
|
||||
1,
|
||||
1,
|
||||
S<1, 16, 1, 16>,
|
||||
4>;
|
||||
|
||||
template <ck::index_t NDimSpatial>
|
||||
using HostConvFwdInstance = ck::tensor_operation::host::ReferenceConvFwd<NDimSpatial,
|
||||
InUserDataType,
|
||||
WeiUserDataType,
|
||||
CShuffleDataType,
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
PassThrough>;
|
||||
|
||||
template <ck::index_t NDimSpatial>
|
||||
bool run_grouped_conv_fwd(const ExecutionConfig& config,
|
||||
const ck::utils::conv::ConvParam& conv_param)
|
||||
{
|
||||
static_assert(1 <= NDimSpatial && NDimSpatial <= 3, "Unsupported NDimSpatial");
|
||||
|
||||
const auto in_g_n_c_wis_desc = make_input_descriptor(conv_param);
|
||||
const auto wei_g_k_c_xs_desc = make_weight_descriptor(conv_param);
|
||||
const auto out_g_n_k_wos_desc = make_output_descriptor(conv_param);
|
||||
|
||||
Tensor<InUserDataType> in(in_g_n_c_wis_desc);
|
||||
Tensor<WeiUserDataType> wei(wei_g_k_c_xs_desc);
|
||||
Tensor<OutUserDataType> out_host(out_g_n_k_wos_desc);
|
||||
Tensor<OutKernelDataType> out_device(out_g_n_k_wos_desc);
|
||||
|
||||
std::cout << "in: " << in.mDesc << std::endl;
|
||||
std::cout << "wei: " << wei.mDesc << std::endl;
|
||||
std::cout << "out: " << out_host.mDesc << std::endl;
|
||||
|
||||
switch(config.init_method)
|
||||
{
|
||||
case 0: break;
|
||||
case 1:
|
||||
in.GenerateTensorValue(GeneratorTensor_2<InUserDataType>{-5, 5});
|
||||
wei.GenerateTensorValue(GeneratorTensor_2<WeiUserDataType>{-5, 5});
|
||||
break;
|
||||
default:
|
||||
in.GenerateTensorValue(GeneratorTensor_3<InUserDataType>{0.0, 1.0});
|
||||
wei.GenerateTensorValue(GeneratorTensor_3<WeiUserDataType>{-0.5, 0.5});
|
||||
}
|
||||
|
||||
DeviceMem in_device_buf(sizeof(InKernelDataType) * in.mDesc.GetElementSpaceSize());
|
||||
DeviceMem wei_device_buf(sizeof(WeiKernelDataType) * wei.mDesc.GetElementSpaceSize());
|
||||
DeviceMem out_device_buf(sizeof(OutKernelDataType) * out_device.mDesc.GetElementSpaceSize());
|
||||
|
||||
#ifdef BUILD_INT4_EXAMPLE
|
||||
const Tensor<InKernelDataType> in_converted(in);
|
||||
const Tensor<WeiKernelDataType> wei_converted(wei);
|
||||
|
||||
in_device_buf.ToDevice(in_converted.mData.data());
|
||||
wei_device_buf.ToDevice(wei_converted.mData.data());
|
||||
#else
|
||||
in_device_buf.ToDevice(in.mData.data());
|
||||
wei_device_buf.ToDevice(wei.mData.data());
|
||||
#endif
|
||||
|
||||
std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_lengths{};
|
||||
std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_strides{};
|
||||
std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_lengths{};
|
||||
std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_strides{};
|
||||
std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_lengths{};
|
||||
std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_strides{};
|
||||
std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
|
||||
std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
|
||||
std::array<ck::index_t, NDimSpatial> input_left_pads{};
|
||||
std::array<ck::index_t, NDimSpatial> input_right_pads{};
|
||||
|
||||
auto copy = [](auto& x, auto& y) { std::copy(x.begin(), x.end(), y.begin()); };
|
||||
|
||||
copy(in_g_n_c_wis_desc.GetLengths(), a_g_n_c_wis_lengths);
|
||||
copy(in_g_n_c_wis_desc.GetStrides(), a_g_n_c_wis_strides);
|
||||
copy(wei_g_k_c_xs_desc.GetLengths(), b_g_k_c_xs_lengths);
|
||||
copy(wei_g_k_c_xs_desc.GetStrides(), b_g_k_c_xs_strides);
|
||||
copy(out_g_n_k_wos_desc.GetLengths(), e_g_n_k_wos_lengths);
|
||||
copy(out_g_n_k_wos_desc.GetStrides(), e_g_n_k_wos_strides);
|
||||
copy(conv_param.conv_filter_strides_, conv_filter_strides);
|
||||
copy(conv_param.conv_filter_dilations_, conv_filter_dilations);
|
||||
copy(conv_param.input_left_pads_, input_left_pads);
|
||||
copy(conv_param.input_right_pads_, input_right_pads);
|
||||
|
||||
// do Conv
|
||||
auto conv = DeviceConvFwdInstance<NDimSpatial>{};
|
||||
auto invoker = conv.MakeInvoker();
|
||||
auto argument = conv.MakeArgument(in_device_buf.GetDeviceBuffer(),
|
||||
wei_device_buf.GetDeviceBuffer(),
|
||||
std::array<const void*, 0>{},
|
||||
out_device_buf.GetDeviceBuffer(),
|
||||
a_g_n_c_wis_lengths,
|
||||
a_g_n_c_wis_strides,
|
||||
b_g_k_c_xs_lengths,
|
||||
b_g_k_c_xs_strides,
|
||||
std::array<std::array<ck::index_t, NDimSpatial + 3>, 0>{},
|
||||
std::array<std::array<ck::index_t, NDimSpatial + 3>, 0>{},
|
||||
e_g_n_k_wos_lengths,
|
||||
e_g_n_k_wos_strides,
|
||||
conv_filter_strides,
|
||||
conv_filter_dilations,
|
||||
input_left_pads,
|
||||
input_right_pads,
|
||||
InElementOp{},
|
||||
WeiElementOp{},
|
||||
OutElementOp{});
|
||||
|
||||
if(!conv.IsSupportedArgument(argument))
|
||||
{
|
||||
throw std::runtime_error(
|
||||
"wrong! device_conv with the specified compilation parameters does "
|
||||
"not support this Conv problem");
|
||||
}
|
||||
|
||||
float avg_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});
|
||||
|
||||
std::size_t flop = conv_param.GetFlops();
|
||||
std::size_t num_btype = conv_param.GetByte<InUserDataType, WeiUserDataType, OutUserDataType>();
|
||||
|
||||
float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
|
||||
float gb_per_sec = num_btype / 1.E6 / avg_time;
|
||||
std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
|
||||
<< conv.GetTypeString() << std::endl;
|
||||
|
||||
if(config.do_verification)
|
||||
{
|
||||
auto ref_conv = HostConvFwdInstance<NDimSpatial>{};
|
||||
auto ref_invoker = ref_conv.MakeInvoker();
|
||||
auto ref_argument = ref_conv.MakeArgument(in,
|
||||
wei,
|
||||
out_host,
|
||||
conv_param.conv_filter_strides_,
|
||||
conv_param.conv_filter_dilations_,
|
||||
conv_param.input_left_pads_,
|
||||
conv_param.input_right_pads_,
|
||||
InElementOp{},
|
||||
WeiElementOp{},
|
||||
OutElementOp{});
|
||||
|
||||
ref_invoker.Run(ref_argument);
|
||||
|
||||
out_device_buf.FromDevice(out_device.mData.data());
|
||||
|
||||
#ifdef BUILD_INT4_EXAMPLE
|
||||
const Tensor<OutUserDataType> out_device_converted(out_device);
|
||||
|
||||
return ck::utils::check_err(
|
||||
out_device_converted.mData, out_host.mData, "Error: incorrect results!", 1e-5f, 1e-4f);
|
||||
#else
|
||||
return ck::utils::check_err(
|
||||
out_device.mData, out_host.mData, "Error: incorrect results!", 1e-5f, 1e-4f);
|
||||
#endif
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool run_grouped_conv_fwd_example(int argc, char* argv[])
|
||||
{
|
||||
ExecutionConfig config;
|
||||
ck::utils::conv::ConvParam conv_param = DefaultConvParam;
|
||||
|
||||
if(!parse_cmd_args(argc, argv, config, conv_param))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
switch(conv_param.num_dim_spatial_)
|
||||
{
|
||||
case 1: return run_grouped_conv_fwd<1>(config, conv_param);
|
||||
case 2: return run_grouped_conv_fwd<2>(config, conv_param);
|
||||
case 3: return run_grouped_conv_fwd<3>(config, conv_param);
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
@@ -1,11 +0,0 @@
|
||||
add_example_executable(example_grouped_convnd_fwd_bias_relu_add_xdl_fp16 grouped_convnd_fwd_bias_relu_add_xdl_fp16.cpp)
|
||||
|
||||
add_example_executable(example_grouped_convnd_fwd_bias_relu_add_xdl_fp32 grouped_convnd_fwd_bias_relu_add_xdl_fp32.cpp)
|
||||
|
||||
add_example_executable(example_grouped_convnd_fwd_bias_relu_add_xdl_bf16 grouped_convnd_fwd_bias_relu_add_xdl_bf16.cpp)
|
||||
|
||||
add_example_executable(example_grouped_convnd_fwd_bias_relu_add_xdl_int8 grouped_convnd_fwd_bias_relu_add_xdl_int8.cpp)
|
||||
|
||||
if(USE_BITINT_EXTENSION_INT4)
|
||||
add_example_executable(example_grouped_convnd_fwd_bias_relu_add_xdl_int4 grouped_convnd_fwd_bias_relu_add_xdl_int4.cpp)
|
||||
endif() # USE_BITINT_EXTENSION_INT4
|
||||
@@ -1,34 +0,0 @@
|
||||
```bash
|
||||
#arg1: verification (0=no, 1=yes)
|
||||
#arg2: initialization (0=no init, 1=integer value, 2=decimal value)
|
||||
#arg3: time kernel (0=no, 1=yes)
|
||||
#Following arguments (depending on number of spatial dims):
|
||||
# N spatial dimensions
|
||||
# G, N, K, C,
|
||||
# <filter spatial dimensions>, (ie Y, X for 2D)
|
||||
# <input image spatial dimensions>, (ie Hi, Wi for 2D)
|
||||
# <strides>, (ie Sy, Sx for 2D)
|
||||
# <dilations>, (ie Dy, Dx for 2D)
|
||||
# <left padding>, (ie LeftPy, LeftPx for 2D)
|
||||
# <right padding>, (ie RightPy, RightPx for 2D)
|
||||
|
||||
bin/example_grouped_convnd_fwd_bias_relu_add_xdl_fp16 1 1 1
|
||||
```
|
||||
|
||||
Result (MI100)
|
||||
```
|
||||
in: dim 5, lengths {2, 128, 192, 71, 71}, strides {192, 1935744, 1, 27264, 384}
|
||||
wei: dim 5, lengths {2, 256, 192, 3, 3}, strides {442368, 1728, 1, 576, 192}
|
||||
bias: dim 5, lengths {2, 128, 256, 36, 36}, strides {256, 0, 1, 0, 0}
|
||||
residual: dim 5, lengths {2, 128, 256, 36, 36}, strides {256, 0, 1, 0, 0}
|
||||
out: dim 5, lengths {2, 128, 256, 36, 36}, strides {256, 663552, 1, 18432, 512}
|
||||
A[M, K]: {165888, 1728}
|
||||
B[N, K]: {256, 1728}
|
||||
Ds[M, N]: {165888, 256}
|
||||
Ds[M, N]: {165888, 256}
|
||||
E[M, N]: {165888, 256}
|
||||
launch_and_time_kernel: grid_dim {2592, 1, 1}, block_dim {256, 1, 1}
|
||||
Warm up 1 time
|
||||
Start running 10 times...
|
||||
Perf: 2.48075 ms, 118.325 TFlops, 268.946 GB/s, DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<256, 128, 256, 32, Default>
|
||||
```
|
||||
@@ -1,459 +0,0 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "grouped_convnd_fwd_bias_relu_add_common.hpp"
|
||||
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
|
||||
|
||||
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
|
||||
|
||||
// kernel data types
|
||||
using InKernelDataType = ck::bhalf_t;
|
||||
using WeiKernelDataType = ck::bhalf_t;
|
||||
using AccDataType = float;
|
||||
using CShuffleDataType = float;
|
||||
using BiasKernelDataType = ck::bhalf_t;
|
||||
using ResidualKernelDataType = ck::bhalf_t;
|
||||
using OutKernelDataType = ck::bhalf_t;
|
||||
|
||||
// tensor data types
|
||||
using InUserDataType = InKernelDataType;
|
||||
using WeiUserDataType = WeiKernelDataType;
|
||||
using OutUserDataType = OutKernelDataType;
|
||||
|
||||
template <ck::index_t... Is>
|
||||
using S = ck::Sequence<Is...>;
|
||||
|
||||
using InElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd;
|
||||
|
||||
static constexpr auto ConvSpec =
|
||||
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
|
||||
|
||||
static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
|
||||
|
||||
template <ck::index_t NDimSpatial,
|
||||
typename InLayout,
|
||||
typename WeiLayout,
|
||||
typename BiasLayout,
|
||||
typename ResidualLayout,
|
||||
typename OutLayout>
|
||||
using DeviceGroupedConvNDFwdInstance =
|
||||
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<
|
||||
NDimSpatial,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
ck::Tuple<BiasLayout, ResidualLayout>,
|
||||
OutLayout,
|
||||
InKernelDataType,
|
||||
WeiKernelDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
ck::Tuple<BiasKernelDataType, ResidualKernelDataType>,
|
||||
OutKernelDataType,
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
OutElementOp,
|
||||
ConvSpec, // ConvForwardSpecialization
|
||||
GemmSpec, // GemmSpecialization
|
||||
1, //
|
||||
256, // BlockSize
|
||||
128, // MPerBlock
|
||||
256, // NPerBlock
|
||||
32, // KPerBlock
|
||||
8, // AK1
|
||||
8, // BK1
|
||||
32, // MPerXdl
|
||||
32, // NPerXdl
|
||||
2, // MXdlPerWave
|
||||
4, // NXdlPerWave
|
||||
S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
|
||||
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
|
||||
S<1, 0, 2>, // ABlockTransferSrcAccessOrder
|
||||
2, // ABlockTransferSrcVectorDim
|
||||
8, // ABlockTransferSrcScalarPerVector
|
||||
8, // ABlockTransferDstScalarPerVector_AK1
|
||||
1, // ABlockLdsExtraM
|
||||
S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
|
||||
S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder
|
||||
S<1, 0, 2>, // BBlockTransferSrcAccessOrder
|
||||
2, // BBlockTransferSrcVectorDim
|
||||
8, // BBlockTransferSrcScalarPerVector
|
||||
8, // BBlockTransferDstScalarPerVector_BK1
|
||||
1, // BBlockLdsExtraN
|
||||
1,
|
||||
1,
|
||||
S<1, 32, 1, 8>,
|
||||
8>;
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
namespace ctc = ck::tensor_layout::convolution;
|
||||
|
||||
print_helper_msg();
|
||||
|
||||
bool do_verification = true;
|
||||
int init_method = 1;
|
||||
bool time_kernel = false;
|
||||
|
||||
// conventional group conv definition
|
||||
// G = 2
|
||||
// [N, C, Hi, Wi] = [128, 384, 71, 71]
|
||||
// [K, C, Y, X] = [512, 192, 3, 3]
|
||||
// [N, K, Ho, Wo] = [128, 512, 36, 36]
|
||||
// CK group conv definition
|
||||
// [G, N, C, Hi, Wi] = [2, 128, 192, 71, 71]
|
||||
// [G, K, C, Y, X] = [2, 256, 192, 3, 3]
|
||||
// [G, N, K, Ho, Wo] = [2, 128, 256, 36, 36]
|
||||
ck::utils::conv::ConvParam conv_param{
|
||||
2, 2, 128, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, {1, 1}};
|
||||
|
||||
if(argc == 1)
|
||||
{
|
||||
// use default
|
||||
}
|
||||
else if(argc == 4)
|
||||
{
|
||||
do_verification = std::stoi(argv[1]);
|
||||
init_method = std::stoi(argv[2]);
|
||||
time_kernel = std::stoi(argv[3]);
|
||||
}
|
||||
else
|
||||
{
|
||||
do_verification = std::stoi(argv[1]);
|
||||
init_method = std::stoi(argv[2]);
|
||||
time_kernel = std::stoi(argv[3]);
|
||||
const ck::index_t num_dim_spatial = std::stoi(argv[4]);
|
||||
|
||||
conv_param = ck::utils::conv::parse_conv_param(num_dim_spatial, 5, argv);
|
||||
}
|
||||
|
||||
const auto in_element_op = InElementOp{};
|
||||
const auto wei_element_op = WeiElementOp{};
|
||||
const auto out_element_op = OutElementOp{};
|
||||
|
||||
if(conv_param.num_dim_spatial_ == 1)
|
||||
{
|
||||
using InLayout = ctc::G_NW_C;
|
||||
using WeiLayout = ctc::G_K_X_C;
|
||||
using BiasLayout = ctc::G_K;
|
||||
using ResidualLayout = ctc::G_NW_K;
|
||||
using OutLayout = ctc::G_NW_K;
|
||||
|
||||
const auto in_g_n_c_wis_desc = HostTensorDescriptor(
|
||||
{conv_param.G_, conv_param.N_, conv_param.C_, conv_param.input_spatial_lengths_[0]},
|
||||
{
|
||||
conv_param.C_, // g
|
||||
conv_param.input_spatial_lengths_[0] * conv_param.G_ * conv_param.C_, // n
|
||||
1, // c
|
||||
conv_param.G_ * conv_param.C_ // wi
|
||||
});
|
||||
|
||||
const auto wei_g_k_c_xs_desc = HostTensorDescriptor(
|
||||
{conv_param.G_, conv_param.K_, conv_param.C_, conv_param.filter_spatial_lengths_[0]},
|
||||
{
|
||||
conv_param.K_ * conv_param.filter_spatial_lengths_[0] * conv_param.C_, // g
|
||||
conv_param.filter_spatial_lengths_[0] * conv_param.C_, // k
|
||||
1, // c
|
||||
conv_param.C_ // x
|
||||
});
|
||||
|
||||
const auto bias_g_n_k_wos_desc = HostTensorDescriptor(
|
||||
{conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
|
||||
{
|
||||
conv_param.K_, // g
|
||||
0, // k
|
||||
1, // c
|
||||
0 // x
|
||||
});
|
||||
|
||||
const auto residual_g_n_k_wos_desc = HostTensorDescriptor(
|
||||
{conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
|
||||
{
|
||||
conv_param.K_, // g
|
||||
0, // k
|
||||
1, // c
|
||||
0 // x
|
||||
});
|
||||
|
||||
const auto out_g_n_k_wos_desc = HostTensorDescriptor(
|
||||
{conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
|
||||
{
|
||||
conv_param.K_, // g
|
||||
conv_param.output_spatial_lengths_[0] * conv_param.G_ * conv_param.K_, // n
|
||||
1, // k
|
||||
conv_param.G_ * conv_param.K_ // wo
|
||||
});
|
||||
|
||||
return run_grouped_conv_fwd_bias_relu_add<1,
|
||||
InKernelDataType,
|
||||
WeiKernelDataType,
|
||||
CShuffleDataType,
|
||||
OutKernelDataType,
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
OutElementOp,
|
||||
InUserDataType,
|
||||
WeiUserDataType,
|
||||
OutUserDataType,
|
||||
DeviceGroupedConvNDFwdInstance<1,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
BiasLayout,
|
||||
ResidualLayout,
|
||||
OutLayout>>(
|
||||
do_verification,
|
||||
init_method,
|
||||
time_kernel,
|
||||
conv_param,
|
||||
in_g_n_c_wis_desc,
|
||||
wei_g_k_c_xs_desc,
|
||||
bias_g_n_k_wos_desc,
|
||||
residual_g_n_k_wos_desc,
|
||||
out_g_n_k_wos_desc,
|
||||
in_element_op,
|
||||
wei_element_op,
|
||||
out_element_op);
|
||||
}
|
||||
else if(conv_param.num_dim_spatial_ == 2)
|
||||
{
|
||||
using InLayout = ctc::G_NHW_C;
|
||||
using WeiLayout = ctc::G_K_YX_C;
|
||||
using BiasLayout = ctc::G_K;
|
||||
using ResidualLayout = ctc::G_NHW_K;
|
||||
using OutLayout = ctc::G_NHW_K;
|
||||
|
||||
const auto in_g_n_c_wis_desc = HostTensorDescriptor(
|
||||
{conv_param.G_,
|
||||
conv_param.N_,
|
||||
conv_param.C_,
|
||||
conv_param.input_spatial_lengths_[0],
|
||||
conv_param.input_spatial_lengths_[1]},
|
||||
{
|
||||
conv_param.C_, // g
|
||||
conv_param.input_spatial_lengths_[0] * conv_param.input_spatial_lengths_[1] *
|
||||
conv_param.G_ * conv_param.C_, // n
|
||||
1, // c
|
||||
conv_param.input_spatial_lengths_[1] * conv_param.G_ * conv_param.C_, // hi
|
||||
conv_param.G_ * conv_param.C_ // wi
|
||||
});
|
||||
|
||||
const auto wei_g_k_c_xs_desc =
|
||||
HostTensorDescriptor({conv_param.G_,
|
||||
conv_param.K_,
|
||||
conv_param.C_,
|
||||
conv_param.filter_spatial_lengths_[0],
|
||||
conv_param.filter_spatial_lengths_[1]},
|
||||
{
|
||||
conv_param.K_ * conv_param.filter_spatial_lengths_[0] *
|
||||
conv_param.filter_spatial_lengths_[1] * conv_param.C_, // g
|
||||
conv_param.filter_spatial_lengths_[0] *
|
||||
conv_param.filter_spatial_lengths_[1] * conv_param.C_, // k
|
||||
1, // c
|
||||
conv_param.filter_spatial_lengths_[1] * conv_param.C_, // y
|
||||
conv_param.C_ // x
|
||||
});
|
||||
|
||||
const auto bias_g_n_k_wos_desc =
|
||||
HostTensorDescriptor({conv_param.G_,
|
||||
conv_param.N_,
|
||||
conv_param.K_,
|
||||
conv_param.output_spatial_lengths_[0],
|
||||
conv_param.output_spatial_lengths_[1]},
|
||||
{
|
||||
conv_param.K_, // g
|
||||
0, // n
|
||||
1, // k
|
||||
0, // ho
|
||||
0 // wo
|
||||
});
|
||||
|
||||
const auto residual_g_n_k_wos_desc =
|
||||
HostTensorDescriptor({conv_param.G_,
|
||||
conv_param.N_,
|
||||
conv_param.K_,
|
||||
conv_param.output_spatial_lengths_[0],
|
||||
conv_param.output_spatial_lengths_[1]},
|
||||
{
|
||||
conv_param.K_, // g
|
||||
0, // n
|
||||
1, // k
|
||||
0, // ho
|
||||
0 // wo
|
||||
});
|
||||
|
||||
const auto out_g_n_k_wos_desc = HostTensorDescriptor(
|
||||
{conv_param.G_,
|
||||
conv_param.N_,
|
||||
conv_param.K_,
|
||||
conv_param.output_spatial_lengths_[0],
|
||||
conv_param.output_spatial_lengths_[1]},
|
||||
{
|
||||
conv_param.K_, // g
|
||||
conv_param.output_spatial_lengths_[0] * conv_param.output_spatial_lengths_[1] *
|
||||
conv_param.G_ * conv_param.K_, // n
|
||||
1, // k
|
||||
conv_param.output_spatial_lengths_[1] * conv_param.G_ * conv_param.K_, // ho
|
||||
conv_param.G_ * conv_param.K_ // wo
|
||||
});
|
||||
|
||||
return run_grouped_conv_fwd_bias_relu_add<2,
|
||||
InKernelDataType,
|
||||
WeiKernelDataType,
|
||||
CShuffleDataType,
|
||||
OutKernelDataType,
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
OutElementOp,
|
||||
InUserDataType,
|
||||
WeiUserDataType,
|
||||
OutUserDataType,
|
||||
DeviceGroupedConvNDFwdInstance<2,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
BiasLayout,
|
||||
ResidualLayout,
|
||||
OutLayout>>(
|
||||
do_verification,
|
||||
init_method,
|
||||
time_kernel,
|
||||
conv_param,
|
||||
in_g_n_c_wis_desc,
|
||||
wei_g_k_c_xs_desc,
|
||||
bias_g_n_k_wos_desc,
|
||||
residual_g_n_k_wos_desc,
|
||||
out_g_n_k_wos_desc,
|
||||
in_element_op,
|
||||
wei_element_op,
|
||||
out_element_op);
|
||||
}
|
||||
else if(conv_param.num_dim_spatial_ == 3)
|
||||
{
|
||||
using InLayout = ctc::G_NDHW_C;
|
||||
using WeiLayout = ctc::G_K_ZYX_C;
|
||||
using BiasLayout = ctc::G_K;
|
||||
using ResidualLayout = ctc::G_NDHW_K;
|
||||
using OutLayout = ctc::G_NDHW_K;
|
||||
|
||||
const auto in_g_n_c_wis_desc = HostTensorDescriptor(
|
||||
{conv_param.G_,
|
||||
conv_param.N_,
|
||||
conv_param.C_,
|
||||
conv_param.input_spatial_lengths_[0],
|
||||
conv_param.input_spatial_lengths_[1],
|
||||
conv_param.input_spatial_lengths_[2]},
|
||||
{
|
||||
conv_param.C_, // g
|
||||
conv_param.input_spatial_lengths_[0] * conv_param.input_spatial_lengths_[1] *
|
||||
conv_param.input_spatial_lengths_[2] * conv_param.G_ * conv_param.C_, // n
|
||||
1, // c
|
||||
conv_param.input_spatial_lengths_[1] * conv_param.input_spatial_lengths_[2] *
|
||||
conv_param.G_ * conv_param.C_, // di
|
||||
conv_param.input_spatial_lengths_[2] * conv_param.G_ * conv_param.C_, // hi
|
||||
conv_param.G_ * conv_param.C_ // wi
|
||||
});
|
||||
|
||||
const auto wei_g_k_c_xs_desc = HostTensorDescriptor(
|
||||
{conv_param.G_,
|
||||
conv_param.K_,
|
||||
conv_param.C_,
|
||||
conv_param.filter_spatial_lengths_[0],
|
||||
conv_param.filter_spatial_lengths_[1],
|
||||
conv_param.filter_spatial_lengths_[2]},
|
||||
{
|
||||
conv_param.K_ * conv_param.filter_spatial_lengths_[0] *
|
||||
conv_param.filter_spatial_lengths_[1] * conv_param.filter_spatial_lengths_[2] *
|
||||
conv_param.C_, // g
|
||||
conv_param.filter_spatial_lengths_[0] * conv_param.filter_spatial_lengths_[1] *
|
||||
conv_param.filter_spatial_lengths_[2] * conv_param.C_, // k
|
||||
1, // c
|
||||
conv_param.filter_spatial_lengths_[1] * conv_param.filter_spatial_lengths_[2] *
|
||||
conv_param.C_, // z
|
||||
conv_param.filter_spatial_lengths_[2] * conv_param.C_, // y
|
||||
conv_param.C_ // x
|
||||
});
|
||||
|
||||
const auto bias_g_n_k_wos_desc =
|
||||
HostTensorDescriptor({conv_param.G_,
|
||||
conv_param.N_,
|
||||
conv_param.K_,
|
||||
conv_param.output_spatial_lengths_[0],
|
||||
conv_param.output_spatial_lengths_[1],
|
||||
conv_param.output_spatial_lengths_[2]},
|
||||
{
|
||||
conv_param.K_, // g
|
||||
0, // n
|
||||
1, // k
|
||||
0, // z
|
||||
0, // y
|
||||
0 // x
|
||||
});
|
||||
|
||||
const auto residual_g_n_k_wos_desc =
|
||||
HostTensorDescriptor({conv_param.G_,
|
||||
conv_param.N_,
|
||||
conv_param.K_,
|
||||
conv_param.output_spatial_lengths_[0],
|
||||
conv_param.output_spatial_lengths_[1],
|
||||
conv_param.output_spatial_lengths_[2]},
|
||||
{
|
||||
conv_param.K_, // g
|
||||
0, // n
|
||||
1, // k
|
||||
0, // z
|
||||
0, // y
|
||||
0 // x
|
||||
});
|
||||
|
||||
const auto out_g_n_k_wos_desc = HostTensorDescriptor(
|
||||
{conv_param.G_,
|
||||
conv_param.N_,
|
||||
conv_param.K_,
|
||||
conv_param.output_spatial_lengths_[0],
|
||||
conv_param.output_spatial_lengths_[1],
|
||||
conv_param.output_spatial_lengths_[2]},
|
||||
{
|
||||
conv_param.K_, // g
|
||||
conv_param.output_spatial_lengths_[0] * conv_param.output_spatial_lengths_[1] *
|
||||
conv_param.output_spatial_lengths_[2] * conv_param.G_ * conv_param.K_, // n
|
||||
1, // k
|
||||
conv_param.output_spatial_lengths_[1] * conv_param.output_spatial_lengths_[2] *
|
||||
conv_param.G_ * conv_param.K_, // do
|
||||
conv_param.output_spatial_lengths_[2] * conv_param.G_ * conv_param.K_, // ho
|
||||
conv_param.G_ * conv_param.K_ // wo
|
||||
});
|
||||
|
||||
return run_grouped_conv_fwd_bias_relu_add<3,
|
||||
InKernelDataType,
|
||||
WeiKernelDataType,
|
||||
CShuffleDataType,
|
||||
OutKernelDataType,
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
OutElementOp,
|
||||
InUserDataType,
|
||||
WeiUserDataType,
|
||||
OutUserDataType,
|
||||
DeviceGroupedConvNDFwdInstance<3,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
BiasLayout,
|
||||
ResidualLayout,
|
||||
OutLayout>>(
|
||||
do_verification,
|
||||
init_method,
|
||||
time_kernel,
|
||||
conv_param,
|
||||
in_g_n_c_wis_desc,
|
||||
wei_g_k_c_xs_desc,
|
||||
bias_g_n_k_wos_desc,
|
||||
residual_g_n_k_wos_desc,
|
||||
out_g_n_k_wos_desc,
|
||||
in_element_op,
|
||||
wei_element_op,
|
||||
out_element_op);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -1,459 +0,0 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "grouped_convnd_fwd_bias_relu_add_common.hpp"
|
||||
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
|
||||
|
||||
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
|
||||
|
||||
// kernel data types
|
||||
using InKernelDataType = ck::half_t;
|
||||
using WeiKernelDataType = ck::half_t;
|
||||
using AccDataType = float;
|
||||
using CShuffleDataType = ck::half_t;
|
||||
using BiasKernelDataType = ck::half_t;
|
||||
using ResidualKernelDataType = ck::half_t;
|
||||
using OutKernelDataType = ck::half_t;
|
||||
|
||||
// tensor data types
|
||||
using InUserDataType = InKernelDataType;
|
||||
using WeiUserDataType = WeiKernelDataType;
|
||||
using OutUserDataType = OutKernelDataType;
|
||||
|
||||
template <ck::index_t... Is>
|
||||
using S = ck::Sequence<Is...>;
|
||||
|
||||
using InElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd;
|
||||
|
||||
static constexpr auto ConvSpec =
|
||||
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
|
||||
|
||||
static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
|
||||
|
||||
template <ck::index_t NDimSpatial,
|
||||
typename InLayout,
|
||||
typename WeiLayout,
|
||||
typename BiasLayout,
|
||||
typename ResidualLayout,
|
||||
typename OutLayout>
|
||||
using DeviceGroupedConvNDFwdInstance =
|
||||
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<
|
||||
NDimSpatial,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
ck::Tuple<BiasLayout, ResidualLayout>,
|
||||
OutLayout,
|
||||
InKernelDataType,
|
||||
WeiKernelDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
ck::Tuple<BiasKernelDataType, ResidualKernelDataType>,
|
||||
OutKernelDataType,
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
OutElementOp,
|
||||
ConvSpec, // ConvForwardSpecialization
|
||||
GemmSpec, // GemmSpecialization
|
||||
1, //
|
||||
256, // BlockSize
|
||||
128, // MPerBlock
|
||||
256, // NPerBlock
|
||||
32, // KPerBlock
|
||||
8, // AK1
|
||||
8, // BK1
|
||||
32, // MPerXdl
|
||||
32, // NPerXdl
|
||||
2, // MXdlPerWave
|
||||
4, // NXdlPerWave
|
||||
S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
|
||||
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
|
||||
S<1, 0, 2>, // ABlockTransferSrcAccessOrder
|
||||
2, // ABlockTransferSrcVectorDim
|
||||
8, // ABlockTransferSrcScalarPerVector
|
||||
8, // ABlockTransferDstScalarPerVector_AK1
|
||||
1, // ABlockLdsExtraM
|
||||
S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
|
||||
S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder
|
||||
S<1, 0, 2>, // BBlockTransferSrcAccessOrder
|
||||
2, // BBlockTransferSrcVectorDim
|
||||
8, // BBlockTransferSrcScalarPerVector
|
||||
8, // BBlockTransferDstScalarPerVector_BK1
|
||||
1, // BBlockLdsExtraN
|
||||
1,
|
||||
1,
|
||||
S<1, 32, 1, 8>,
|
||||
8>;
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
namespace ctc = ck::tensor_layout::convolution;
|
||||
|
||||
print_helper_msg();
|
||||
|
||||
bool do_verification = true;
|
||||
int init_method = 1;
|
||||
bool time_kernel = false;
|
||||
|
||||
// conventional group conv definition
|
||||
// G = 2
|
||||
// [N, C, Hi, Wi] = [128, 384, 71, 71]
|
||||
// [K, C, Y, X] = [512, 192, 3, 3]
|
||||
// [N, K, Ho, Wo] = [128, 512, 36, 36]
|
||||
// CK group conv definition
|
||||
// [G, N, C, Hi, Wi] = [2, 128, 192, 71, 71]
|
||||
// [G, K, C, Y, X] = [2, 256, 192, 3, 3]
|
||||
// [G, N, K, Ho, Wo] = [2, 128, 256, 36, 36]
|
||||
ck::utils::conv::ConvParam conv_param{
|
||||
2, 2, 128, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, {1, 1}};
|
||||
|
||||
if(argc == 1)
|
||||
{
|
||||
// use default
|
||||
}
|
||||
else if(argc == 4)
|
||||
{
|
||||
do_verification = std::stoi(argv[1]);
|
||||
init_method = std::stoi(argv[2]);
|
||||
time_kernel = std::stoi(argv[3]);
|
||||
}
|
||||
else
|
||||
{
|
||||
do_verification = std::stoi(argv[1]);
|
||||
init_method = std::stoi(argv[2]);
|
||||
time_kernel = std::stoi(argv[3]);
|
||||
const ck::index_t num_dim_spatial = std::stoi(argv[4]);
|
||||
|
||||
conv_param = ck::utils::conv::parse_conv_param(num_dim_spatial, 5, argv);
|
||||
}
|
||||
|
||||
const auto in_element_op = InElementOp{};
|
||||
const auto wei_element_op = WeiElementOp{};
|
||||
const auto out_element_op = OutElementOp{};
|
||||
|
||||
if(conv_param.num_dim_spatial_ == 1)
|
||||
{
|
||||
using InLayout = ctc::G_NW_C;
|
||||
using WeiLayout = ctc::G_K_X_C;
|
||||
using BiasLayout = ctc::G_K;
|
||||
using ResidualLayout = ctc::G_NW_K;
|
||||
using OutLayout = ctc::G_NW_K;
|
||||
|
||||
const auto in_g_n_c_wis_desc = HostTensorDescriptor(
|
||||
{conv_param.G_, conv_param.N_, conv_param.C_, conv_param.input_spatial_lengths_[0]},
|
||||
{
|
||||
conv_param.C_, // g
|
||||
conv_param.input_spatial_lengths_[0] * conv_param.G_ * conv_param.C_, // n
|
||||
1, // c
|
||||
conv_param.G_ * conv_param.C_ // wi
|
||||
});
|
||||
|
||||
const auto wei_g_k_c_xs_desc = HostTensorDescriptor(
|
||||
{conv_param.G_, conv_param.K_, conv_param.C_, conv_param.filter_spatial_lengths_[0]},
|
||||
{
|
||||
conv_param.K_ * conv_param.filter_spatial_lengths_[0] * conv_param.C_, // g
|
||||
conv_param.filter_spatial_lengths_[0] * conv_param.C_, // k
|
||||
1, // c
|
||||
conv_param.C_ // x
|
||||
});
|
||||
|
||||
const auto bias_g_n_k_wos_desc = HostTensorDescriptor(
|
||||
{conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
|
||||
{
|
||||
conv_param.K_, // g
|
||||
0, // n
|
||||
1, // k
|
||||
0 // wo
|
||||
});
|
||||
|
||||
const auto residual_g_n_k_wos_desc = HostTensorDescriptor(
|
||||
{conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
|
||||
{
|
||||
conv_param.K_, // g
|
||||
0, // k
|
||||
1, // c
|
||||
0 // x
|
||||
});
|
||||
|
||||
const auto out_g_n_k_wos_desc = HostTensorDescriptor(
|
||||
{conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
|
||||
{
|
||||
conv_param.K_, // g
|
||||
conv_param.output_spatial_lengths_[0] * conv_param.G_ * conv_param.K_, // n
|
||||
1, // k
|
||||
conv_param.G_ * conv_param.K_ // wo
|
||||
});
|
||||
|
||||
return run_grouped_conv_fwd_bias_relu_add<1,
|
||||
InKernelDataType,
|
||||
WeiKernelDataType,
|
||||
CShuffleDataType,
|
||||
OutKernelDataType,
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
OutElementOp,
|
||||
InUserDataType,
|
||||
WeiUserDataType,
|
||||
OutUserDataType,
|
||||
DeviceGroupedConvNDFwdInstance<1,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
BiasLayout,
|
||||
ResidualLayout,
|
||||
OutLayout>>(
|
||||
do_verification,
|
||||
init_method,
|
||||
time_kernel,
|
||||
conv_param,
|
||||
in_g_n_c_wis_desc,
|
||||
wei_g_k_c_xs_desc,
|
||||
bias_g_n_k_wos_desc,
|
||||
residual_g_n_k_wos_desc,
|
||||
out_g_n_k_wos_desc,
|
||||
in_element_op,
|
||||
wei_element_op,
|
||||
out_element_op);
|
||||
}
|
||||
else if(conv_param.num_dim_spatial_ == 2)
|
||||
{
|
||||
using InLayout = ctc::G_NHW_C;
|
||||
using WeiLayout = ctc::G_K_YX_C;
|
||||
using BiasLayout = ctc::G_K;
|
||||
using ResidualLayout = ctc::G_NHW_K;
|
||||
using OutLayout = ctc::G_NHW_K;
|
||||
|
||||
const auto in_g_n_c_wis_desc = HostTensorDescriptor(
|
||||
{conv_param.G_,
|
||||
conv_param.N_,
|
||||
conv_param.C_,
|
||||
conv_param.input_spatial_lengths_[0],
|
||||
conv_param.input_spatial_lengths_[1]},
|
||||
{
|
||||
conv_param.C_, // g
|
||||
conv_param.input_spatial_lengths_[0] * conv_param.input_spatial_lengths_[1] *
|
||||
conv_param.G_ * conv_param.C_, // n
|
||||
1, // c
|
||||
conv_param.input_spatial_lengths_[1] * conv_param.G_ * conv_param.C_, // hi
|
||||
conv_param.G_ * conv_param.C_ // wi
|
||||
});
|
||||
|
||||
const auto wei_g_k_c_xs_desc =
|
||||
HostTensorDescriptor({conv_param.G_,
|
||||
conv_param.K_,
|
||||
conv_param.C_,
|
||||
conv_param.filter_spatial_lengths_[0],
|
||||
conv_param.filter_spatial_lengths_[1]},
|
||||
{
|
||||
conv_param.K_ * conv_param.filter_spatial_lengths_[0] *
|
||||
conv_param.filter_spatial_lengths_[1] * conv_param.C_, // g
|
||||
conv_param.filter_spatial_lengths_[0] *
|
||||
conv_param.filter_spatial_lengths_[1] * conv_param.C_, // k
|
||||
1, // c
|
||||
conv_param.filter_spatial_lengths_[1] * conv_param.C_, // y
|
||||
conv_param.C_ // x
|
||||
});
|
||||
|
||||
const auto bias_g_n_k_wos_desc =
|
||||
HostTensorDescriptor({conv_param.G_,
|
||||
conv_param.N_,
|
||||
conv_param.K_,
|
||||
conv_param.output_spatial_lengths_[0],
|
||||
conv_param.output_spatial_lengths_[1]},
|
||||
{
|
||||
conv_param.K_, // g
|
||||
0, // n
|
||||
1, // k
|
||||
0, // ho
|
||||
0 // wo
|
||||
});
|
||||
|
||||
const auto residual_g_n_k_wos_desc =
|
||||
HostTensorDescriptor({conv_param.G_,
|
||||
conv_param.N_,
|
||||
conv_param.K_,
|
||||
conv_param.output_spatial_lengths_[0],
|
||||
conv_param.output_spatial_lengths_[1]},
|
||||
{
|
||||
conv_param.K_, // g
|
||||
0, // n
|
||||
1, // k
|
||||
0, // ho
|
||||
0 // wo
|
||||
});
|
||||
|
||||
const auto out_g_n_k_wos_desc = HostTensorDescriptor(
|
||||
{conv_param.G_,
|
||||
conv_param.N_,
|
||||
conv_param.K_,
|
||||
conv_param.output_spatial_lengths_[0],
|
||||
conv_param.output_spatial_lengths_[1]},
|
||||
{
|
||||
conv_param.K_, // g
|
||||
conv_param.output_spatial_lengths_[0] * conv_param.output_spatial_lengths_[1] *
|
||||
conv_param.G_ * conv_param.K_, // n
|
||||
1, // k
|
||||
conv_param.output_spatial_lengths_[1] * conv_param.G_ * conv_param.K_, // ho
|
||||
conv_param.G_ * conv_param.K_ // wo
|
||||
});
|
||||
|
||||
return run_grouped_conv_fwd_bias_relu_add<2,
|
||||
InKernelDataType,
|
||||
WeiKernelDataType,
|
||||
CShuffleDataType,
|
||||
OutKernelDataType,
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
OutElementOp,
|
||||
InUserDataType,
|
||||
WeiUserDataType,
|
||||
OutUserDataType,
|
||||
DeviceGroupedConvNDFwdInstance<2,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
BiasLayout,
|
||||
ResidualLayout,
|
||||
OutLayout>>(
|
||||
do_verification,
|
||||
init_method,
|
||||
time_kernel,
|
||||
conv_param,
|
||||
in_g_n_c_wis_desc,
|
||||
wei_g_k_c_xs_desc,
|
||||
bias_g_n_k_wos_desc,
|
||||
residual_g_n_k_wos_desc,
|
||||
out_g_n_k_wos_desc,
|
||||
in_element_op,
|
||||
wei_element_op,
|
||||
out_element_op);
|
||||
}
|
||||
else if(conv_param.num_dim_spatial_ == 3)
|
||||
{
|
||||
using InLayout = ctc::G_NDHW_C;
|
||||
using WeiLayout = ctc::G_K_ZYX_C;
|
||||
using BiasLayout = ctc::G_K;
|
||||
using ResidualLayout = ctc::G_NDHW_K;
|
||||
using OutLayout = ctc::G_NDHW_K;
|
||||
|
||||
const auto in_g_n_c_wis_desc = HostTensorDescriptor(
|
||||
{conv_param.G_,
|
||||
conv_param.N_,
|
||||
conv_param.C_,
|
||||
conv_param.input_spatial_lengths_[0],
|
||||
conv_param.input_spatial_lengths_[1],
|
||||
conv_param.input_spatial_lengths_[2]},
|
||||
{
|
||||
conv_param.C_, // g
|
||||
conv_param.input_spatial_lengths_[0] * conv_param.input_spatial_lengths_[1] *
|
||||
conv_param.input_spatial_lengths_[2] * conv_param.G_ * conv_param.C_, // n
|
||||
1, // c
|
||||
conv_param.input_spatial_lengths_[1] * conv_param.input_spatial_lengths_[2] *
|
||||
conv_param.G_ * conv_param.C_, // di
|
||||
conv_param.input_spatial_lengths_[2] * conv_param.G_ * conv_param.C_, // hi
|
||||
conv_param.G_ * conv_param.C_ // wi
|
||||
});
|
||||
|
||||
const auto wei_g_k_c_xs_desc = HostTensorDescriptor(
|
||||
{conv_param.G_,
|
||||
conv_param.K_,
|
||||
conv_param.C_,
|
||||
conv_param.filter_spatial_lengths_[0],
|
||||
conv_param.filter_spatial_lengths_[1],
|
||||
conv_param.filter_spatial_lengths_[2]},
|
||||
{
|
||||
conv_param.K_ * conv_param.filter_spatial_lengths_[0] *
|
||||
conv_param.filter_spatial_lengths_[1] * conv_param.filter_spatial_lengths_[2] *
|
||||
conv_param.C_, // g
|
||||
conv_param.filter_spatial_lengths_[0] * conv_param.filter_spatial_lengths_[1] *
|
||||
conv_param.filter_spatial_lengths_[2] * conv_param.C_, // k
|
||||
1, // c
|
||||
conv_param.filter_spatial_lengths_[1] * conv_param.filter_spatial_lengths_[2] *
|
||||
conv_param.C_, // z
|
||||
conv_param.filter_spatial_lengths_[2] * conv_param.C_, // y
|
||||
conv_param.C_ // x
|
||||
});
|
||||
|
||||
const auto bias_g_n_k_wos_desc =
|
||||
HostTensorDescriptor({conv_param.G_,
|
||||
conv_param.N_,
|
||||
conv_param.K_,
|
||||
conv_param.output_spatial_lengths_[0],
|
||||
conv_param.output_spatial_lengths_[1],
|
||||
conv_param.output_spatial_lengths_[2]},
|
||||
{
|
||||
conv_param.K_, // g
|
||||
0, // n
|
||||
1, // k
|
||||
0, // z
|
||||
0, // y
|
||||
0 // x
|
||||
});
|
||||
|
||||
const auto residual_g_n_k_wos_desc =
|
||||
HostTensorDescriptor({conv_param.G_,
|
||||
conv_param.N_,
|
||||
conv_param.K_,
|
||||
conv_param.output_spatial_lengths_[0],
|
||||
conv_param.output_spatial_lengths_[1],
|
||||
conv_param.output_spatial_lengths_[2]},
|
||||
{
|
||||
conv_param.K_, // g
|
||||
0, // n
|
||||
1, // k
|
||||
0, // z
|
||||
0, // y
|
||||
0 // x
|
||||
});
|
||||
|
||||
const auto out_g_n_k_wos_desc = HostTensorDescriptor(
|
||||
{conv_param.G_,
|
||||
conv_param.N_,
|
||||
conv_param.K_,
|
||||
conv_param.output_spatial_lengths_[0],
|
||||
conv_param.output_spatial_lengths_[1],
|
||||
conv_param.output_spatial_lengths_[2]},
|
||||
{
|
||||
conv_param.K_, // g
|
||||
conv_param.output_spatial_lengths_[0] * conv_param.output_spatial_lengths_[1] *
|
||||
conv_param.output_spatial_lengths_[2] * conv_param.G_ * conv_param.K_, // n
|
||||
1, // k
|
||||
conv_param.output_spatial_lengths_[1] * conv_param.output_spatial_lengths_[2] *
|
||||
conv_param.G_ * conv_param.K_, // do
|
||||
conv_param.output_spatial_lengths_[2] * conv_param.G_ * conv_param.K_, // ho
|
||||
conv_param.G_ * conv_param.K_ // wo
|
||||
});
|
||||
|
||||
return run_grouped_conv_fwd_bias_relu_add<3,
|
||||
InKernelDataType,
|
||||
WeiKernelDataType,
|
||||
CShuffleDataType,
|
||||
OutKernelDataType,
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
OutElementOp,
|
||||
InUserDataType,
|
||||
WeiUserDataType,
|
||||
OutUserDataType,
|
||||
DeviceGroupedConvNDFwdInstance<3,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
BiasLayout,
|
||||
ResidualLayout,
|
||||
OutLayout>>(
|
||||
do_verification,
|
||||
init_method,
|
||||
time_kernel,
|
||||
conv_param,
|
||||
in_g_n_c_wis_desc,
|
||||
wei_g_k_c_xs_desc,
|
||||
bias_g_n_k_wos_desc,
|
||||
residual_g_n_k_wos_desc,
|
||||
out_g_n_k_wos_desc,
|
||||
in_element_op,
|
||||
wei_element_op,
|
||||
out_element_op);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -1,459 +0,0 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "grouped_convnd_fwd_bias_relu_add_common.hpp"
|
||||
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
|
||||
|
||||
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
|
||||
|
||||
// kernel data types
|
||||
using InKernelDataType = float;
|
||||
using WeiKernelDataType = float;
|
||||
using AccDataType = float;
|
||||
using CShuffleDataType = float;
|
||||
using BiasKernelDataType = float;
|
||||
using ResidualKernelDataType = float;
|
||||
using OutKernelDataType = float;
|
||||
|
||||
// tensor data types
|
||||
using InUserDataType = InKernelDataType;
|
||||
using WeiUserDataType = WeiKernelDataType;
|
||||
using OutUserDataType = OutKernelDataType;
|
||||
|
||||
template <ck::index_t... Is>
|
||||
using S = ck::Sequence<Is...>;
|
||||
|
||||
using InElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd;
|
||||
|
||||
static constexpr auto ConvSpec =
|
||||
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
|
||||
|
||||
static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
|
||||
|
||||
template <ck::index_t NDimSpatial,
|
||||
typename InLayout,
|
||||
typename WeiLayout,
|
||||
typename BiasLayout,
|
||||
typename ResidualLayout,
|
||||
typename OutLayout>
|
||||
using DeviceGroupedConvNDFwdInstance =
|
||||
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<
|
||||
NDimSpatial,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
ck::Tuple<BiasLayout, ResidualLayout>,
|
||||
OutLayout,
|
||||
InKernelDataType,
|
||||
WeiKernelDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
ck::Tuple<BiasKernelDataType, ResidualKernelDataType>,
|
||||
OutKernelDataType,
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
OutElementOp,
|
||||
ConvSpec, // ConvForwardSpecialization
|
||||
GemmSpec, // GemmSpecialization
|
||||
1, //
|
||||
256, // BlockSize
|
||||
128, // MPerBlock
|
||||
256, // NPerBlock
|
||||
16, // KPerBlock
|
||||
4, // AK1
|
||||
4, // BK1
|
||||
32, // MPerXdl
|
||||
32, // NPerXdl
|
||||
2, // MXdlPerWave
|
||||
4, // NXdlPerWave
|
||||
S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
|
||||
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
|
||||
S<1, 0, 2>, // ABlockTransferSrcAccessOrder
|
||||
2, // ABlockTransferSrcVectorDim
|
||||
4, // ABlockTransferSrcScalarPerVector
|
||||
4, // ABlockTransferDstScalarPerVector_AK1
|
||||
1, // ABlockLdsExtraM
|
||||
S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
|
||||
S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder
|
||||
S<1, 0, 2>, // BBlockTransferSrcAccessOrder
|
||||
2, // BBlockTransferSrcVectorDim
|
||||
4, // BBlockTransferSrcScalarPerVector
|
||||
4, // BBlockTransferDstScalarPerVector_BK1
|
||||
1, // BBlockLdsExtraN
|
||||
1,
|
||||
1,
|
||||
S<1, 16, 1, 16>,
|
||||
4>;
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
namespace ctc = ck::tensor_layout::convolution;
|
||||
|
||||
print_helper_msg();
|
||||
|
||||
bool do_verification = true;
|
||||
int init_method = 1;
|
||||
bool time_kernel = false;
|
||||
|
||||
// conventional group conv definition
|
||||
// G = 2
|
||||
// [N, C, Hi, Wi] = [128, 384, 71, 71]
|
||||
// [K, C, Y, X] = [512, 192, 3, 3]
|
||||
// [N, K, Ho, Wo] = [128, 512, 36, 36]
|
||||
// CK group conv definition
|
||||
// [G, N, C, Hi, Wi] = [2, 128, 192, 71, 71]
|
||||
// [G, K, C, Y, X] = [2, 256, 192, 3, 3]
|
||||
// [G, N, K, Ho, Wo] = [2, 128, 256, 36, 36]
|
||||
ck::utils::conv::ConvParam conv_param{
|
||||
2, 2, 128, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, {1, 1}};
|
||||
|
||||
if(argc == 1)
|
||||
{
|
||||
// use default
|
||||
}
|
||||
else if(argc == 4)
|
||||
{
|
||||
do_verification = std::stoi(argv[1]);
|
||||
init_method = std::stoi(argv[2]);
|
||||
time_kernel = std::stoi(argv[3]);
|
||||
}
|
||||
else
|
||||
{
|
||||
do_verification = std::stoi(argv[1]);
|
||||
init_method = std::stoi(argv[2]);
|
||||
time_kernel = std::stoi(argv[3]);
|
||||
const ck::index_t num_dim_spatial = std::stoi(argv[4]);
|
||||
|
||||
conv_param = ck::utils::conv::parse_conv_param(num_dim_spatial, 5, argv);
|
||||
}
|
||||
|
||||
const auto in_element_op = InElementOp{};
|
||||
const auto wei_element_op = WeiElementOp{};
|
||||
const auto out_element_op = OutElementOp{};
|
||||
|
||||
if(conv_param.num_dim_spatial_ == 1)
|
||||
{
|
||||
using InLayout = ctc::G_NW_C;
|
||||
using WeiLayout = ctc::G_K_X_C;
|
||||
using BiasLayout = ctc::G_K;
|
||||
using ResidualLayout = ctc::G_NW_K;
|
||||
using OutLayout = ctc::G_NW_K;
|
||||
|
||||
const auto in_g_n_c_wis_desc = HostTensorDescriptor(
|
||||
{conv_param.G_, conv_param.N_, conv_param.C_, conv_param.input_spatial_lengths_[0]},
|
||||
{
|
||||
conv_param.C_, // g
|
||||
conv_param.input_spatial_lengths_[0] * conv_param.G_ * conv_param.C_, // n
|
||||
1, // c
|
||||
conv_param.G_ * conv_param.C_ // wi
|
||||
});
|
||||
|
||||
const auto wei_g_k_c_xs_desc = HostTensorDescriptor(
|
||||
{conv_param.G_, conv_param.K_, conv_param.C_, conv_param.filter_spatial_lengths_[0]},
|
||||
{
|
||||
conv_param.K_ * conv_param.filter_spatial_lengths_[0] * conv_param.C_, // g
|
||||
conv_param.filter_spatial_lengths_[0] * conv_param.C_, // k
|
||||
1, // c
|
||||
conv_param.C_ // x
|
||||
});
|
||||
|
||||
const auto bias_g_n_k_wos_desc = HostTensorDescriptor(
|
||||
{conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
|
||||
{
|
||||
conv_param.K_, // g
|
||||
0, // k
|
||||
1, // c
|
||||
0 // x
|
||||
});
|
||||
|
||||
const auto residual_g_n_k_wos_desc = HostTensorDescriptor(
|
||||
{conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
|
||||
{
|
||||
conv_param.K_, // g
|
||||
0, // k
|
||||
1, // c
|
||||
0 // x
|
||||
});
|
||||
|
||||
const auto out_g_n_k_wos_desc = HostTensorDescriptor(
|
||||
{conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
|
||||
{
|
||||
conv_param.K_, // g
|
||||
conv_param.output_spatial_lengths_[0] * conv_param.G_ * conv_param.K_, // n
|
||||
1, // k
|
||||
conv_param.G_ * conv_param.K_ // wo
|
||||
});
|
||||
|
||||
return run_grouped_conv_fwd_bias_relu_add<1,
|
||||
InKernelDataType,
|
||||
WeiKernelDataType,
|
||||
CShuffleDataType,
|
||||
OutKernelDataType,
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
OutElementOp,
|
||||
InUserDataType,
|
||||
WeiUserDataType,
|
||||
OutUserDataType,
|
||||
DeviceGroupedConvNDFwdInstance<1,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
BiasLayout,
|
||||
ResidualLayout,
|
||||
OutLayout>>(
|
||||
do_verification,
|
||||
init_method,
|
||||
time_kernel,
|
||||
conv_param,
|
||||
in_g_n_c_wis_desc,
|
||||
wei_g_k_c_xs_desc,
|
||||
bias_g_n_k_wos_desc,
|
||||
residual_g_n_k_wos_desc,
|
||||
out_g_n_k_wos_desc,
|
||||
in_element_op,
|
||||
wei_element_op,
|
||||
out_element_op);
|
||||
}
|
||||
else if(conv_param.num_dim_spatial_ == 2)
|
||||
{
|
||||
using InLayout = ctc::G_NHW_C;
|
||||
using WeiLayout = ctc::G_K_YX_C;
|
||||
using BiasLayout = ctc::G_K;
|
||||
using ResidualLayout = ctc::G_NHW_K;
|
||||
using OutLayout = ctc::G_NHW_K;
|
||||
|
||||
const auto in_g_n_c_wis_desc = HostTensorDescriptor(
|
||||
{conv_param.G_,
|
||||
conv_param.N_,
|
||||
conv_param.C_,
|
||||
conv_param.input_spatial_lengths_[0],
|
||||
conv_param.input_spatial_lengths_[1]},
|
||||
{
|
||||
conv_param.C_, // g
|
||||
conv_param.input_spatial_lengths_[0] * conv_param.input_spatial_lengths_[1] *
|
||||
conv_param.G_ * conv_param.C_, // n
|
||||
1, // c
|
||||
conv_param.input_spatial_lengths_[1] * conv_param.G_ * conv_param.C_, // hi
|
||||
conv_param.G_ * conv_param.C_ // wi
|
||||
});
|
||||
|
||||
const auto wei_g_k_c_xs_desc =
|
||||
HostTensorDescriptor({conv_param.G_,
|
||||
conv_param.K_,
|
||||
conv_param.C_,
|
||||
conv_param.filter_spatial_lengths_[0],
|
||||
conv_param.filter_spatial_lengths_[1]},
|
||||
{
|
||||
conv_param.K_ * conv_param.filter_spatial_lengths_[0] *
|
||||
conv_param.filter_spatial_lengths_[1] * conv_param.C_, // g
|
||||
conv_param.filter_spatial_lengths_[0] *
|
||||
conv_param.filter_spatial_lengths_[1] * conv_param.C_, // k
|
||||
1, // c
|
||||
conv_param.filter_spatial_lengths_[1] * conv_param.C_, // y
|
||||
conv_param.C_ // x
|
||||
});
|
||||
|
||||
const auto bias_g_n_k_wos_desc =
|
||||
HostTensorDescriptor({conv_param.G_,
|
||||
conv_param.N_,
|
||||
conv_param.K_,
|
||||
conv_param.output_spatial_lengths_[0],
|
||||
conv_param.output_spatial_lengths_[1]},
|
||||
{
|
||||
conv_param.K_, // g
|
||||
0, // n
|
||||
1, // k
|
||||
0, // ho
|
||||
0 // wo
|
||||
});
|
||||
|
||||
const auto residual_g_n_k_wos_desc =
|
||||
HostTensorDescriptor({conv_param.G_,
|
||||
conv_param.N_,
|
||||
conv_param.K_,
|
||||
conv_param.output_spatial_lengths_[0],
|
||||
conv_param.output_spatial_lengths_[1]},
|
||||
{
|
||||
conv_param.K_, // g
|
||||
0, // n
|
||||
1, // k
|
||||
0, // ho
|
||||
0 // wo
|
||||
});
|
||||
|
||||
const auto out_g_n_k_wos_desc = HostTensorDescriptor(
|
||||
{conv_param.G_,
|
||||
conv_param.N_,
|
||||
conv_param.K_,
|
||||
conv_param.output_spatial_lengths_[0],
|
||||
conv_param.output_spatial_lengths_[1]},
|
||||
{
|
||||
conv_param.K_, // g
|
||||
conv_param.output_spatial_lengths_[0] * conv_param.output_spatial_lengths_[1] *
|
||||
conv_param.G_ * conv_param.K_, // n
|
||||
1, // k
|
||||
conv_param.output_spatial_lengths_[1] * conv_param.G_ * conv_param.K_, // ho
|
||||
conv_param.G_ * conv_param.K_ // wo
|
||||
});
|
||||
|
||||
return run_grouped_conv_fwd_bias_relu_add<2,
|
||||
InKernelDataType,
|
||||
WeiKernelDataType,
|
||||
CShuffleDataType,
|
||||
OutKernelDataType,
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
OutElementOp,
|
||||
InUserDataType,
|
||||
WeiUserDataType,
|
||||
OutUserDataType,
|
||||
DeviceGroupedConvNDFwdInstance<2,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
BiasLayout,
|
||||
ResidualLayout,
|
||||
OutLayout>>(
|
||||
do_verification,
|
||||
init_method,
|
||||
time_kernel,
|
||||
conv_param,
|
||||
in_g_n_c_wis_desc,
|
||||
wei_g_k_c_xs_desc,
|
||||
bias_g_n_k_wos_desc,
|
||||
residual_g_n_k_wos_desc,
|
||||
out_g_n_k_wos_desc,
|
||||
in_element_op,
|
||||
wei_element_op,
|
||||
out_element_op);
|
||||
}
|
||||
else if(conv_param.num_dim_spatial_ == 3)
|
||||
{
|
||||
using InLayout = ctc::G_NDHW_C;
|
||||
using WeiLayout = ctc::G_K_ZYX_C;
|
||||
using BiasLayout = ctc::G_K;
|
||||
using ResidualLayout = ctc::G_NDHW_K;
|
||||
using OutLayout = ctc::G_NDHW_K;
|
||||
|
||||
const auto in_g_n_c_wis_desc = HostTensorDescriptor(
|
||||
{conv_param.G_,
|
||||
conv_param.N_,
|
||||
conv_param.C_,
|
||||
conv_param.input_spatial_lengths_[0],
|
||||
conv_param.input_spatial_lengths_[1],
|
||||
conv_param.input_spatial_lengths_[2]},
|
||||
{
|
||||
conv_param.C_, // g
|
||||
conv_param.input_spatial_lengths_[0] * conv_param.input_spatial_lengths_[1] *
|
||||
conv_param.input_spatial_lengths_[2] * conv_param.G_ * conv_param.C_, // n
|
||||
1, // c
|
||||
conv_param.input_spatial_lengths_[1] * conv_param.input_spatial_lengths_[2] *
|
||||
conv_param.G_ * conv_param.C_, // di
|
||||
conv_param.input_spatial_lengths_[2] * conv_param.G_ * conv_param.C_, // hi
|
||||
conv_param.G_ * conv_param.C_ // wi
|
||||
});
|
||||
|
||||
const auto wei_g_k_c_xs_desc = HostTensorDescriptor(
|
||||
{conv_param.G_,
|
||||
conv_param.K_,
|
||||
conv_param.C_,
|
||||
conv_param.filter_spatial_lengths_[0],
|
||||
conv_param.filter_spatial_lengths_[1],
|
||||
conv_param.filter_spatial_lengths_[2]},
|
||||
{
|
||||
conv_param.K_ * conv_param.filter_spatial_lengths_[0] *
|
||||
conv_param.filter_spatial_lengths_[1] * conv_param.filter_spatial_lengths_[2] *
|
||||
conv_param.C_, // g
|
||||
conv_param.filter_spatial_lengths_[0] * conv_param.filter_spatial_lengths_[1] *
|
||||
conv_param.filter_spatial_lengths_[2] * conv_param.C_, // k
|
||||
1, // c
|
||||
conv_param.filter_spatial_lengths_[1] * conv_param.filter_spatial_lengths_[2] *
|
||||
conv_param.C_, // z
|
||||
conv_param.filter_spatial_lengths_[2] * conv_param.C_, // y
|
||||
conv_param.C_ // x
|
||||
});
|
||||
|
||||
const auto bias_g_n_k_wos_desc =
|
||||
HostTensorDescriptor({conv_param.G_,
|
||||
conv_param.N_,
|
||||
conv_param.K_,
|
||||
conv_param.output_spatial_lengths_[0],
|
||||
conv_param.output_spatial_lengths_[1],
|
||||
conv_param.output_spatial_lengths_[2]},
|
||||
{
|
||||
conv_param.K_, // g
|
||||
0, // n
|
||||
1, // k
|
||||
0, // z
|
||||
0, // y
|
||||
0 // x
|
||||
});
|
||||
|
||||
const auto residual_g_n_k_wos_desc =
|
||||
HostTensorDescriptor({conv_param.G_,
|
||||
conv_param.N_,
|
||||
conv_param.K_,
|
||||
conv_param.output_spatial_lengths_[0],
|
||||
conv_param.output_spatial_lengths_[1],
|
||||
conv_param.output_spatial_lengths_[2]},
|
||||
{
|
||||
conv_param.K_, // g
|
||||
0, // n
|
||||
1, // k
|
||||
0, // z
|
||||
0, // y
|
||||
0 // x
|
||||
});
|
||||
|
||||
const auto out_g_n_k_wos_desc = HostTensorDescriptor(
|
||||
{conv_param.G_,
|
||||
conv_param.N_,
|
||||
conv_param.K_,
|
||||
conv_param.output_spatial_lengths_[0],
|
||||
conv_param.output_spatial_lengths_[1],
|
||||
conv_param.output_spatial_lengths_[2]},
|
||||
{
|
||||
conv_param.K_, // g
|
||||
conv_param.output_spatial_lengths_[0] * conv_param.output_spatial_lengths_[1] *
|
||||
conv_param.output_spatial_lengths_[2] * conv_param.G_ * conv_param.K_, // n
|
||||
1, // k
|
||||
conv_param.output_spatial_lengths_[1] * conv_param.output_spatial_lengths_[2] *
|
||||
conv_param.G_ * conv_param.K_, // do
|
||||
conv_param.output_spatial_lengths_[2] * conv_param.G_ * conv_param.K_, // ho
|
||||
conv_param.G_ * conv_param.K_ // wo
|
||||
});
|
||||
|
||||
return run_grouped_conv_fwd_bias_relu_add<3,
|
||||
InKernelDataType,
|
||||
WeiKernelDataType,
|
||||
CShuffleDataType,
|
||||
OutKernelDataType,
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
OutElementOp,
|
||||
InUserDataType,
|
||||
WeiUserDataType,
|
||||
OutUserDataType,
|
||||
DeviceGroupedConvNDFwdInstance<3,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
BiasLayout,
|
||||
ResidualLayout,
|
||||
OutLayout>>(
|
||||
do_verification,
|
||||
init_method,
|
||||
time_kernel,
|
||||
conv_param,
|
||||
in_g_n_c_wis_desc,
|
||||
wei_g_k_c_xs_desc,
|
||||
bias_g_n_k_wos_desc,
|
||||
residual_g_n_k_wos_desc,
|
||||
out_g_n_k_wos_desc,
|
||||
in_element_op,
|
||||
wei_element_op,
|
||||
out_element_op);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -1,459 +0,0 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "grouped_convnd_fwd_bias_relu_add_common.hpp"
|
||||
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
|
||||
|
||||
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
|
||||
|
||||
// kernel data types
|
||||
using InKernelDataType = int8_t;
|
||||
using WeiKernelDataType = int8_t;
|
||||
using AccDataType = int32_t;
|
||||
using CShuffleDataType = int8_t;
|
||||
using BiasKernelDataType = int8_t;
|
||||
using ResidualKernelDataType = int8_t;
|
||||
using OutKernelDataType = int8_t;
|
||||
|
||||
// tensor data types
|
||||
using InUserDataType = ck::int4_t;
|
||||
using WeiUserDataType = ck::int4_t;
|
||||
using OutUserDataType = ck::int4_t;
|
||||
|
||||
template <ck::index_t... Is>
|
||||
using S = ck::Sequence<Is...>;
|
||||
|
||||
using InElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd;
|
||||
|
||||
static constexpr auto ConvSpec =
|
||||
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
|
||||
|
||||
static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
|
||||
|
||||
template <ck::index_t NDimSpatial,
|
||||
typename InLayout,
|
||||
typename WeiLayout,
|
||||
typename BiasLayout,
|
||||
typename ResidualLayout,
|
||||
typename OutLayout>
|
||||
using DeviceGroupedConvNDFwdInstance =
|
||||
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<
|
||||
NDimSpatial,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
ck::Tuple<BiasLayout, ResidualLayout>,
|
||||
OutLayout,
|
||||
InKernelDataType,
|
||||
WeiKernelDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
ck::Tuple<BiasKernelDataType, ResidualKernelDataType>,
|
||||
OutKernelDataType,
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
OutElementOp,
|
||||
ConvSpec, // ConvForwardSpecialization
|
||||
GemmSpec, // GemmSpecialization
|
||||
1, //
|
||||
256, // BlockSize
|
||||
128, // MPerBlock
|
||||
256, // NPerBlock
|
||||
64, // KPerBlock
|
||||
16, // AK1
|
||||
16, // BK1
|
||||
32, // MPerXdl
|
||||
32, // NPerXdl
|
||||
2, // MXdlPerWave
|
||||
4, // NXdlPerWave
|
||||
S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
|
||||
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
|
||||
S<1, 0, 2>, // ABlockTransferSrcAccessOrder
|
||||
2, // ABlockTransferSrcVectorDim
|
||||
16, // ABlockTransferSrcScalarPerVector
|
||||
16, // ABlockTransferDstScalarPerVector_AK1
|
||||
1, // ABlockLdsExtraM
|
||||
S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
|
||||
S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder
|
||||
S<1, 0, 2>, // BBlockTransferSrcAccessOrder
|
||||
2, // BBlockTransferSrcVectorDim
|
||||
16, // BBlockTransferSrcScalarPerVector
|
||||
16, // BBlockTransferDstScalarPerVector_BK1
|
||||
1, // BBlockLdsExtraN
|
||||
1,
|
||||
1,
|
||||
S<1, 64, 1, 4>,
|
||||
16>;
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
namespace ctc = ck::tensor_layout::convolution;
|
||||
|
||||
print_helper_msg();
|
||||
|
||||
bool do_verification = true;
|
||||
int init_method = 1;
|
||||
bool time_kernel = false;
|
||||
|
||||
// conventional group conv definition
|
||||
// G = 2
|
||||
// [N, C, Hi, Wi] = [128, 384, 71, 71]
|
||||
// [K, C, Y, X] = [512, 192, 3, 3]
|
||||
// [N, K, Ho, Wo] = [128, 512, 36, 36]
|
||||
// CK group conv definition
|
||||
// [G, N, C, Hi, Wi] = [2, 128, 192, 71, 71]
|
||||
// [G, K, C, Y, X] = [2, 256, 192, 3, 3]
|
||||
// [G, N, K, Ho, Wo] = [2, 128, 256, 36, 36]
|
||||
ck::utils::conv::ConvParam conv_param{
|
||||
2, 2, 128, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, {1, 1}};
|
||||
|
||||
if(argc == 1)
|
||||
{
|
||||
// use default
|
||||
}
|
||||
else if(argc == 4)
|
||||
{
|
||||
do_verification = std::stoi(argv[1]);
|
||||
init_method = std::stoi(argv[2]);
|
||||
time_kernel = std::stoi(argv[3]);
|
||||
}
|
||||
else
|
||||
{
|
||||
do_verification = std::stoi(argv[1]);
|
||||
init_method = std::stoi(argv[2]);
|
||||
time_kernel = std::stoi(argv[3]);
|
||||
const ck::index_t num_dim_spatial = std::stoi(argv[4]);
|
||||
|
||||
conv_param = ck::utils::conv::parse_conv_param(num_dim_spatial, 5, argv);
|
||||
}
|
||||
|
||||
const auto in_element_op = InElementOp{};
|
||||
const auto wei_element_op = WeiElementOp{};
|
||||
const auto out_element_op = OutElementOp{};
|
||||
|
||||
if(conv_param.num_dim_spatial_ == 1)
|
||||
{
|
||||
using InLayout = ctc::G_NW_C;
|
||||
using WeiLayout = ctc::G_K_X_C;
|
||||
using BiasLayout = ctc::G_K;
|
||||
using ResidualLayout = ctc::G_NW_K;
|
||||
using OutLayout = ctc::G_NW_K;
|
||||
|
||||
const auto in_g_n_c_wis_desc = HostTensorDescriptor(
|
||||
{conv_param.G_, conv_param.N_, conv_param.C_, conv_param.input_spatial_lengths_[0]},
|
||||
{
|
||||
conv_param.C_, // g
|
||||
conv_param.input_spatial_lengths_[0] * conv_param.G_ * conv_param.C_, // n
|
||||
1, // c
|
||||
conv_param.G_ * conv_param.C_ // wi
|
||||
});
|
||||
|
||||
const auto wei_g_k_c_xs_desc = HostTensorDescriptor(
|
||||
{conv_param.G_, conv_param.K_, conv_param.C_, conv_param.filter_spatial_lengths_[0]},
|
||||
{
|
||||
conv_param.K_ * conv_param.filter_spatial_lengths_[0] * conv_param.C_, // g
|
||||
conv_param.filter_spatial_lengths_[0] * conv_param.C_, // k
|
||||
1, // c
|
||||
conv_param.C_ // x
|
||||
});
|
||||
|
||||
const auto bias_g_n_k_wos_desc = HostTensorDescriptor(
|
||||
{conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
|
||||
{
|
||||
conv_param.K_, // g
|
||||
0, // k
|
||||
1, // c
|
||||
0 // x
|
||||
});
|
||||
|
||||
const auto residual_g_n_k_wos_desc = HostTensorDescriptor(
|
||||
{conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
|
||||
{
|
||||
conv_param.K_, // g
|
||||
0, // k
|
||||
1, // c
|
||||
0 // x
|
||||
});
|
||||
|
||||
const auto out_g_n_k_wos_desc = HostTensorDescriptor(
|
||||
{conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
|
||||
{
|
||||
conv_param.K_, // g
|
||||
conv_param.output_spatial_lengths_[0] * conv_param.G_ * conv_param.K_, // n
|
||||
1, // k
|
||||
conv_param.G_ * conv_param.K_ // wo
|
||||
});
|
||||
|
||||
return run_grouped_conv_fwd_bias_relu_add<1,
|
||||
InKernelDataType,
|
||||
WeiKernelDataType,
|
||||
CShuffleDataType,
|
||||
OutKernelDataType,
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
OutElementOp,
|
||||
InUserDataType,
|
||||
WeiUserDataType,
|
||||
OutUserDataType,
|
||||
DeviceGroupedConvNDFwdInstance<1,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
BiasLayout,
|
||||
ResidualLayout,
|
||||
OutLayout>>(
|
||||
do_verification,
|
||||
init_method,
|
||||
time_kernel,
|
||||
conv_param,
|
||||
in_g_n_c_wis_desc,
|
||||
wei_g_k_c_xs_desc,
|
||||
bias_g_n_k_wos_desc,
|
||||
residual_g_n_k_wos_desc,
|
||||
out_g_n_k_wos_desc,
|
||||
in_element_op,
|
||||
wei_element_op,
|
||||
out_element_op);
|
||||
}
|
||||
else if(conv_param.num_dim_spatial_ == 2)
|
||||
{
|
||||
using InLayout = ctc::G_NHW_C;
|
||||
using WeiLayout = ctc::G_K_YX_C;
|
||||
using BiasLayout = ctc::G_K;
|
||||
using ResidualLayout = ctc::G_NHW_K;
|
||||
using OutLayout = ctc::G_NHW_K;
|
||||
|
||||
const auto in_g_n_c_wis_desc = HostTensorDescriptor(
|
||||
{conv_param.G_,
|
||||
conv_param.N_,
|
||||
conv_param.C_,
|
||||
conv_param.input_spatial_lengths_[0],
|
||||
conv_param.input_spatial_lengths_[1]},
|
||||
{
|
||||
conv_param.C_, // g
|
||||
conv_param.input_spatial_lengths_[0] * conv_param.input_spatial_lengths_[1] *
|
||||
conv_param.G_ * conv_param.C_, // n
|
||||
1, // c
|
||||
conv_param.input_spatial_lengths_[1] * conv_param.G_ * conv_param.C_, // hi
|
||||
conv_param.G_ * conv_param.C_ // wi
|
||||
});
|
||||
|
||||
const auto wei_g_k_c_xs_desc =
|
||||
HostTensorDescriptor({conv_param.G_,
|
||||
conv_param.K_,
|
||||
conv_param.C_,
|
||||
conv_param.filter_spatial_lengths_[0],
|
||||
conv_param.filter_spatial_lengths_[1]},
|
||||
{
|
||||
conv_param.K_ * conv_param.filter_spatial_lengths_[0] *
|
||||
conv_param.filter_spatial_lengths_[1] * conv_param.C_, // g
|
||||
conv_param.filter_spatial_lengths_[0] *
|
||||
conv_param.filter_spatial_lengths_[1] * conv_param.C_, // k
|
||||
1, // c
|
||||
conv_param.filter_spatial_lengths_[1] * conv_param.C_, // y
|
||||
conv_param.C_ // x
|
||||
});
|
||||
|
||||
const auto bias_g_n_k_wos_desc =
|
||||
HostTensorDescriptor({conv_param.G_,
|
||||
conv_param.N_,
|
||||
conv_param.K_,
|
||||
conv_param.output_spatial_lengths_[0],
|
||||
conv_param.output_spatial_lengths_[1]},
|
||||
{
|
||||
conv_param.K_, // g
|
||||
0, // n
|
||||
1, // k
|
||||
0, // ho
|
||||
0 // wo
|
||||
});
|
||||
|
||||
const auto residual_g_n_k_wos_desc =
|
||||
HostTensorDescriptor({conv_param.G_,
|
||||
conv_param.N_,
|
||||
conv_param.K_,
|
||||
conv_param.output_spatial_lengths_[0],
|
||||
conv_param.output_spatial_lengths_[1]},
|
||||
{
|
||||
conv_param.K_, // g
|
||||
0, // n
|
||||
1, // k
|
||||
0, // ho
|
||||
0 // wo
|
||||
});
|
||||
|
||||
const auto out_g_n_k_wos_desc = HostTensorDescriptor(
|
||||
{conv_param.G_,
|
||||
conv_param.N_,
|
||||
conv_param.K_,
|
||||
conv_param.output_spatial_lengths_[0],
|
||||
conv_param.output_spatial_lengths_[1]},
|
||||
{
|
||||
conv_param.K_, // g
|
||||
conv_param.output_spatial_lengths_[0] * conv_param.output_spatial_lengths_[1] *
|
||||
conv_param.G_ * conv_param.K_, // n
|
||||
1, // k
|
||||
conv_param.output_spatial_lengths_[1] * conv_param.G_ * conv_param.K_, // ho
|
||||
conv_param.G_ * conv_param.K_ // wo
|
||||
});
|
||||
|
||||
return run_grouped_conv_fwd_bias_relu_add<2,
|
||||
InKernelDataType,
|
||||
WeiKernelDataType,
|
||||
CShuffleDataType,
|
||||
OutKernelDataType,
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
OutElementOp,
|
||||
InUserDataType,
|
||||
WeiUserDataType,
|
||||
OutUserDataType,
|
||||
DeviceGroupedConvNDFwdInstance<2,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
BiasLayout,
|
||||
ResidualLayout,
|
||||
OutLayout>>(
|
||||
do_verification,
|
||||
init_method,
|
||||
time_kernel,
|
||||
conv_param,
|
||||
in_g_n_c_wis_desc,
|
||||
wei_g_k_c_xs_desc,
|
||||
bias_g_n_k_wos_desc,
|
||||
residual_g_n_k_wos_desc,
|
||||
out_g_n_k_wos_desc,
|
||||
in_element_op,
|
||||
wei_element_op,
|
||||
out_element_op);
|
||||
}
|
||||
else if(conv_param.num_dim_spatial_ == 3)
|
||||
{
|
||||
using InLayout = ctc::G_NDHW_C;
|
||||
using WeiLayout = ctc::G_K_ZYX_C;
|
||||
using BiasLayout = ctc::G_K;
|
||||
using ResidualLayout = ctc::G_NDHW_K;
|
||||
using OutLayout = ctc::G_NDHW_K;
|
||||
|
||||
const auto in_g_n_c_wis_desc = HostTensorDescriptor(
|
||||
{conv_param.G_,
|
||||
conv_param.N_,
|
||||
conv_param.C_,
|
||||
conv_param.input_spatial_lengths_[0],
|
||||
conv_param.input_spatial_lengths_[1],
|
||||
conv_param.input_spatial_lengths_[2]},
|
||||
{
|
||||
conv_param.C_, // g
|
||||
conv_param.input_spatial_lengths_[0] * conv_param.input_spatial_lengths_[1] *
|
||||
conv_param.input_spatial_lengths_[2] * conv_param.G_ * conv_param.C_, // n
|
||||
1, // c
|
||||
conv_param.input_spatial_lengths_[1] * conv_param.input_spatial_lengths_[2] *
|
||||
conv_param.G_ * conv_param.C_, // di
|
||||
conv_param.input_spatial_lengths_[2] * conv_param.G_ * conv_param.C_, // hi
|
||||
conv_param.G_ * conv_param.C_ // wi
|
||||
});
|
||||
|
||||
const auto wei_g_k_c_xs_desc = HostTensorDescriptor(
|
||||
{conv_param.G_,
|
||||
conv_param.K_,
|
||||
conv_param.C_,
|
||||
conv_param.filter_spatial_lengths_[0],
|
||||
conv_param.filter_spatial_lengths_[1],
|
||||
conv_param.filter_spatial_lengths_[2]},
|
||||
{
|
||||
conv_param.K_ * conv_param.filter_spatial_lengths_[0] *
|
||||
conv_param.filter_spatial_lengths_[1] * conv_param.filter_spatial_lengths_[2] *
|
||||
conv_param.C_, // g
|
||||
conv_param.filter_spatial_lengths_[0] * conv_param.filter_spatial_lengths_[1] *
|
||||
conv_param.filter_spatial_lengths_[2] * conv_param.C_, // k
|
||||
1, // c
|
||||
conv_param.filter_spatial_lengths_[1] * conv_param.filter_spatial_lengths_[2] *
|
||||
conv_param.C_, // z
|
||||
conv_param.filter_spatial_lengths_[2] * conv_param.C_, // y
|
||||
conv_param.C_ // x
|
||||
});
|
||||
|
||||
const auto bias_g_n_k_wos_desc =
|
||||
HostTensorDescriptor({conv_param.G_,
|
||||
conv_param.N_,
|
||||
conv_param.K_,
|
||||
conv_param.output_spatial_lengths_[0],
|
||||
conv_param.output_spatial_lengths_[1],
|
||||
conv_param.output_spatial_lengths_[2]},
|
||||
{
|
||||
conv_param.K_, // g
|
||||
0, // n
|
||||
1, // k
|
||||
0, // z
|
||||
0, // y
|
||||
0 // x
|
||||
});
|
||||
|
||||
const auto residual_g_n_k_wos_desc =
|
||||
HostTensorDescriptor({conv_param.G_,
|
||||
conv_param.N_,
|
||||
conv_param.K_,
|
||||
conv_param.output_spatial_lengths_[0],
|
||||
conv_param.output_spatial_lengths_[1],
|
||||
conv_param.output_spatial_lengths_[2]},
|
||||
{
|
||||
conv_param.K_, // g
|
||||
0, // n
|
||||
1, // k
|
||||
0, // z
|
||||
0, // y
|
||||
0 // x
|
||||
});
|
||||
|
||||
const auto out_g_n_k_wos_desc = HostTensorDescriptor(
|
||||
{conv_param.G_,
|
||||
conv_param.N_,
|
||||
conv_param.K_,
|
||||
conv_param.output_spatial_lengths_[0],
|
||||
conv_param.output_spatial_lengths_[1],
|
||||
conv_param.output_spatial_lengths_[2]},
|
||||
{
|
||||
conv_param.K_, // g
|
||||
conv_param.output_spatial_lengths_[0] * conv_param.output_spatial_lengths_[1] *
|
||||
conv_param.output_spatial_lengths_[2] * conv_param.G_ * conv_param.K_, // n
|
||||
1, // k
|
||||
conv_param.output_spatial_lengths_[1] * conv_param.output_spatial_lengths_[2] *
|
||||
conv_param.G_ * conv_param.K_, // do
|
||||
conv_param.output_spatial_lengths_[2] * conv_param.G_ * conv_param.K_, // ho
|
||||
conv_param.G_ * conv_param.K_ // wo
|
||||
});
|
||||
|
||||
return run_grouped_conv_fwd_bias_relu_add<3,
|
||||
InKernelDataType,
|
||||
WeiKernelDataType,
|
||||
CShuffleDataType,
|
||||
OutKernelDataType,
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
OutElementOp,
|
||||
InUserDataType,
|
||||
WeiUserDataType,
|
||||
OutUserDataType,
|
||||
DeviceGroupedConvNDFwdInstance<3,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
BiasLayout,
|
||||
ResidualLayout,
|
||||
OutLayout>>(
|
||||
do_verification,
|
||||
init_method,
|
||||
time_kernel,
|
||||
conv_param,
|
||||
in_g_n_c_wis_desc,
|
||||
wei_g_k_c_xs_desc,
|
||||
bias_g_n_k_wos_desc,
|
||||
residual_g_n_k_wos_desc,
|
||||
out_g_n_k_wos_desc,
|
||||
in_element_op,
|
||||
wei_element_op,
|
||||
out_element_op);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -1,459 +0,0 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "grouped_convnd_fwd_bias_relu_add_common.hpp"
|
||||
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
|
||||
|
||||
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
|
||||
|
||||
// kernel data types
|
||||
using InKernelDataType = int8_t;
|
||||
using WeiKernelDataType = int8_t;
|
||||
using AccDataType = int32_t;
|
||||
using CShuffleDataType = int8_t;
|
||||
using BiasKernelDataType = int8_t;
|
||||
using ResidualKernelDataType = int8_t;
|
||||
using OutKernelDataType = int8_t;
|
||||
|
||||
// tensor data types
|
||||
using InUserDataType = InKernelDataType;
|
||||
using WeiUserDataType = WeiKernelDataType;
|
||||
using OutUserDataType = OutKernelDataType;
|
||||
|
||||
template <ck::index_t... Is>
|
||||
using S = ck::Sequence<Is...>;
|
||||
|
||||
using InElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
|
||||
using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd;
|
||||
|
||||
static constexpr auto ConvSpec =
|
||||
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
|
||||
|
||||
static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
|
||||
|
||||
template <ck::index_t NDimSpatial,
|
||||
typename InLayout,
|
||||
typename WeiLayout,
|
||||
typename BiasLayout,
|
||||
typename ResidualLayout,
|
||||
typename OutLayout>
|
||||
using DeviceGroupedConvNDFwdInstance =
|
||||
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<
|
||||
NDimSpatial,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
ck::Tuple<BiasLayout, ResidualLayout>,
|
||||
OutLayout,
|
||||
InKernelDataType,
|
||||
WeiKernelDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
ck::Tuple<BiasKernelDataType, ResidualKernelDataType>,
|
||||
OutKernelDataType,
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
OutElementOp,
|
||||
ConvSpec, // ConvForwardSpecialization
|
||||
GemmSpec, // GemmSpecialization
|
||||
1, //
|
||||
256, // BlockSize
|
||||
128, // MPerBlock
|
||||
256, // NPerBlock
|
||||
64, // KPerBlock
|
||||
16, // AK1
|
||||
16, // BK1
|
||||
32, // MPerXdl
|
||||
32, // NPerXdl
|
||||
2, // MXdlPerWave
|
||||
4, // NXdlPerWave
|
||||
S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
|
||||
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
|
||||
S<1, 0, 2>, // ABlockTransferSrcAccessOrder
|
||||
2, // ABlockTransferSrcVectorDim
|
||||
16, // ABlockTransferSrcScalarPerVector
|
||||
16, // ABlockTransferDstScalarPerVector_AK1
|
||||
1, // ABlockLdsExtraM
|
||||
S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
|
||||
S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder
|
||||
S<1, 0, 2>, // BBlockTransferSrcAccessOrder
|
||||
2, // BBlockTransferSrcVectorDim
|
||||
16, // BBlockTransferSrcScalarPerVector
|
||||
16, // BBlockTransferDstScalarPerVector_BK1
|
||||
1, // BBlockLdsExtraN
|
||||
1,
|
||||
1,
|
||||
S<1, 64, 1, 4>,
|
||||
16>;
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
namespace ctc = ck::tensor_layout::convolution;
|
||||
|
||||
print_helper_msg();
|
||||
|
||||
bool do_verification = true;
|
||||
int init_method = 1;
|
||||
bool time_kernel = false;
|
||||
|
||||
// conventional group conv definition
|
||||
// G = 2
|
||||
// [N, C, Hi, Wi] = [128, 384, 71, 71]
|
||||
// [K, C, Y, X] = [512, 192, 3, 3]
|
||||
// [N, K, Ho, Wo] = [128, 512, 36, 36]
|
||||
// CK group conv definition
|
||||
// [G, N, C, Hi, Wi] = [2, 128, 192, 71, 71]
|
||||
// [G, K, C, Y, X] = [2, 256, 192, 3, 3]
|
||||
// [G, N, K, Ho, Wo] = [2, 128, 256, 36, 36]
|
||||
ck::utils::conv::ConvParam conv_param{
|
||||
2, 2, 128, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, {1, 1}};
|
||||
|
||||
if(argc == 1)
|
||||
{
|
||||
// use default
|
||||
}
|
||||
else if(argc == 4)
|
||||
{
|
||||
do_verification = std::stoi(argv[1]);
|
||||
init_method = std::stoi(argv[2]);
|
||||
time_kernel = std::stoi(argv[3]);
|
||||
}
|
||||
else
|
||||
{
|
||||
do_verification = std::stoi(argv[1]);
|
||||
init_method = std::stoi(argv[2]);
|
||||
time_kernel = std::stoi(argv[3]);
|
||||
const ck::index_t num_dim_spatial = std::stoi(argv[4]);
|
||||
|
||||
conv_param = ck::utils::conv::parse_conv_param(num_dim_spatial, 5, argv);
|
||||
}
|
||||
|
||||
const auto in_element_op = InElementOp{};
|
||||
const auto wei_element_op = WeiElementOp{};
|
||||
const auto out_element_op = OutElementOp{};
|
||||
|
||||
if(conv_param.num_dim_spatial_ == 1)
|
||||
{
|
||||
using InLayout = ctc::G_NW_C;
|
||||
using WeiLayout = ctc::G_K_X_C;
|
||||
using BiasLayout = ctc::G_K;
|
||||
using ResidualLayout = ctc::G_NW_K;
|
||||
using OutLayout = ctc::G_NW_K;
|
||||
|
||||
const auto in_g_n_c_wis_desc = HostTensorDescriptor(
|
||||
{conv_param.G_, conv_param.N_, conv_param.C_, conv_param.input_spatial_lengths_[0]},
|
||||
{
|
||||
conv_param.C_, // g
|
||||
conv_param.input_spatial_lengths_[0] * conv_param.G_ * conv_param.C_, // n
|
||||
1, // c
|
||||
conv_param.G_ * conv_param.C_ // wi
|
||||
});
|
||||
|
||||
const auto wei_g_k_c_xs_desc = HostTensorDescriptor(
|
||||
{conv_param.G_, conv_param.K_, conv_param.C_, conv_param.filter_spatial_lengths_[0]},
|
||||
{
|
||||
conv_param.K_ * conv_param.filter_spatial_lengths_[0] * conv_param.C_, // g
|
||||
conv_param.filter_spatial_lengths_[0] * conv_param.C_, // k
|
||||
1, // c
|
||||
conv_param.C_ // x
|
||||
});
|
||||
|
||||
const auto bias_g_n_k_wos_desc = HostTensorDescriptor(
|
||||
{conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
|
||||
{
|
||||
conv_param.K_, // g
|
||||
0, // k
|
||||
1, // c
|
||||
0 // x
|
||||
});
|
||||
|
||||
const auto residual_g_n_k_wos_desc = HostTensorDescriptor(
|
||||
{conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
|
||||
{
|
||||
conv_param.K_, // g
|
||||
0, // k
|
||||
1, // c
|
||||
0 // x
|
||||
});
|
||||
|
||||
const auto out_g_n_k_wos_desc = HostTensorDescriptor(
|
||||
{conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]},
|
||||
{
|
||||
conv_param.K_, // g
|
||||
conv_param.output_spatial_lengths_[0] * conv_param.G_ * conv_param.K_, // n
|
||||
1, // k
|
||||
conv_param.G_ * conv_param.K_ // wo
|
||||
});
|
||||
|
||||
return run_grouped_conv_fwd_bias_relu_add<1,
|
||||
InKernelDataType,
|
||||
WeiKernelDataType,
|
||||
CShuffleDataType,
|
||||
OutKernelDataType,
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
OutElementOp,
|
||||
InUserDataType,
|
||||
WeiUserDataType,
|
||||
OutUserDataType,
|
||||
DeviceGroupedConvNDFwdInstance<1,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
BiasLayout,
|
||||
ResidualLayout,
|
||||
OutLayout>>(
|
||||
do_verification,
|
||||
init_method,
|
||||
time_kernel,
|
||||
conv_param,
|
||||
in_g_n_c_wis_desc,
|
||||
wei_g_k_c_xs_desc,
|
||||
bias_g_n_k_wos_desc,
|
||||
residual_g_n_k_wos_desc,
|
||||
out_g_n_k_wos_desc,
|
||||
in_element_op,
|
||||
wei_element_op,
|
||||
out_element_op);
|
||||
}
|
||||
else if(conv_param.num_dim_spatial_ == 2)
|
||||
{
|
||||
using InLayout = ctc::G_NHW_C;
|
||||
using WeiLayout = ctc::G_K_YX_C;
|
||||
using BiasLayout = ctc::G_K;
|
||||
using ResidualLayout = ctc::G_NHW_K;
|
||||
using OutLayout = ctc::G_NHW_K;
|
||||
|
||||
const auto in_g_n_c_wis_desc = HostTensorDescriptor(
|
||||
{conv_param.G_,
|
||||
conv_param.N_,
|
||||
conv_param.C_,
|
||||
conv_param.input_spatial_lengths_[0],
|
||||
conv_param.input_spatial_lengths_[1]},
|
||||
{
|
||||
conv_param.C_, // g
|
||||
conv_param.input_spatial_lengths_[0] * conv_param.input_spatial_lengths_[1] *
|
||||
conv_param.G_ * conv_param.C_, // n
|
||||
1, // c
|
||||
conv_param.input_spatial_lengths_[1] * conv_param.G_ * conv_param.C_, // hi
|
||||
conv_param.G_ * conv_param.C_ // wi
|
||||
});
|
||||
|
||||
const auto wei_g_k_c_xs_desc =
|
||||
HostTensorDescriptor({conv_param.G_,
|
||||
conv_param.K_,
|
||||
conv_param.C_,
|
||||
conv_param.filter_spatial_lengths_[0],
|
||||
conv_param.filter_spatial_lengths_[1]},
|
||||
{
|
||||
conv_param.K_ * conv_param.filter_spatial_lengths_[0] *
|
||||
conv_param.filter_spatial_lengths_[1] * conv_param.C_, // g
|
||||
conv_param.filter_spatial_lengths_[0] *
|
||||
conv_param.filter_spatial_lengths_[1] * conv_param.C_, // k
|
||||
1, // c
|
||||
conv_param.filter_spatial_lengths_[1] * conv_param.C_, // y
|
||||
conv_param.C_ // x
|
||||
});
|
||||
|
||||
const auto bias_g_n_k_wos_desc =
|
||||
HostTensorDescriptor({conv_param.G_,
|
||||
conv_param.N_,
|
||||
conv_param.K_,
|
||||
conv_param.output_spatial_lengths_[0],
|
||||
conv_param.output_spatial_lengths_[1]},
|
||||
{
|
||||
conv_param.K_, // g
|
||||
0, // n
|
||||
1, // k
|
||||
0, // ho
|
||||
0 // wo
|
||||
});
|
||||
|
||||
const auto residual_g_n_k_wos_desc =
|
||||
HostTensorDescriptor({conv_param.G_,
|
||||
conv_param.N_,
|
||||
conv_param.K_,
|
||||
conv_param.output_spatial_lengths_[0],
|
||||
conv_param.output_spatial_lengths_[1]},
|
||||
{
|
||||
conv_param.K_, // g
|
||||
0, // n
|
||||
1, // k
|
||||
0, // ho
|
||||
0 // wo
|
||||
});
|
||||
|
||||
const auto out_g_n_k_wos_desc = HostTensorDescriptor(
|
||||
{conv_param.G_,
|
||||
conv_param.N_,
|
||||
conv_param.K_,
|
||||
conv_param.output_spatial_lengths_[0],
|
||||
conv_param.output_spatial_lengths_[1]},
|
||||
{
|
||||
conv_param.K_, // g
|
||||
conv_param.output_spatial_lengths_[0] * conv_param.output_spatial_lengths_[1] *
|
||||
conv_param.G_ * conv_param.K_, // n
|
||||
1, // k
|
||||
conv_param.output_spatial_lengths_[1] * conv_param.G_ * conv_param.K_, // ho
|
||||
conv_param.G_ * conv_param.K_ // wo
|
||||
});
|
||||
|
||||
return run_grouped_conv_fwd_bias_relu_add<2,
|
||||
InKernelDataType,
|
||||
WeiKernelDataType,
|
||||
CShuffleDataType,
|
||||
OutKernelDataType,
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
OutElementOp,
|
||||
InUserDataType,
|
||||
WeiUserDataType,
|
||||
OutUserDataType,
|
||||
DeviceGroupedConvNDFwdInstance<2,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
BiasLayout,
|
||||
ResidualLayout,
|
||||
OutLayout>>(
|
||||
do_verification,
|
||||
init_method,
|
||||
time_kernel,
|
||||
conv_param,
|
||||
in_g_n_c_wis_desc,
|
||||
wei_g_k_c_xs_desc,
|
||||
bias_g_n_k_wos_desc,
|
||||
residual_g_n_k_wos_desc,
|
||||
out_g_n_k_wos_desc,
|
||||
in_element_op,
|
||||
wei_element_op,
|
||||
out_element_op);
|
||||
}
|
||||
else if(conv_param.num_dim_spatial_ == 3)
|
||||
{
|
||||
using InLayout = ctc::G_NDHW_C;
|
||||
using WeiLayout = ctc::G_K_ZYX_C;
|
||||
using BiasLayout = ctc::G_K;
|
||||
using ResidualLayout = ctc::G_NDHW_K;
|
||||
using OutLayout = ctc::G_NDHW_K;
|
||||
|
||||
const auto in_g_n_c_wis_desc = HostTensorDescriptor(
|
||||
{conv_param.G_,
|
||||
conv_param.N_,
|
||||
conv_param.C_,
|
||||
conv_param.input_spatial_lengths_[0],
|
||||
conv_param.input_spatial_lengths_[1],
|
||||
conv_param.input_spatial_lengths_[2]},
|
||||
{
|
||||
conv_param.C_, // g
|
||||
conv_param.input_spatial_lengths_[0] * conv_param.input_spatial_lengths_[1] *
|
||||
conv_param.input_spatial_lengths_[2] * conv_param.G_ * conv_param.C_, // n
|
||||
1, // c
|
||||
conv_param.input_spatial_lengths_[1] * conv_param.input_spatial_lengths_[2] *
|
||||
conv_param.G_ * conv_param.C_, // di
|
||||
conv_param.input_spatial_lengths_[2] * conv_param.G_ * conv_param.C_, // hi
|
||||
conv_param.G_ * conv_param.C_ // wi
|
||||
});
|
||||
|
||||
const auto wei_g_k_c_xs_desc = HostTensorDescriptor(
|
||||
{conv_param.G_,
|
||||
conv_param.K_,
|
||||
conv_param.C_,
|
||||
conv_param.filter_spatial_lengths_[0],
|
||||
conv_param.filter_spatial_lengths_[1],
|
||||
conv_param.filter_spatial_lengths_[2]},
|
||||
{
|
||||
conv_param.K_ * conv_param.filter_spatial_lengths_[0] *
|
||||
conv_param.filter_spatial_lengths_[1] * conv_param.filter_spatial_lengths_[2] *
|
||||
conv_param.C_, // g
|
||||
conv_param.filter_spatial_lengths_[0] * conv_param.filter_spatial_lengths_[1] *
|
||||
conv_param.filter_spatial_lengths_[2] * conv_param.C_, // k
|
||||
1, // c
|
||||
conv_param.filter_spatial_lengths_[1] * conv_param.filter_spatial_lengths_[2] *
|
||||
conv_param.C_, // z
|
||||
conv_param.filter_spatial_lengths_[2] * conv_param.C_, // y
|
||||
conv_param.C_ // x
|
||||
});
|
||||
|
||||
const auto bias_g_n_k_wos_desc =
|
||||
HostTensorDescriptor({conv_param.G_,
|
||||
conv_param.N_,
|
||||
conv_param.K_,
|
||||
conv_param.output_spatial_lengths_[0],
|
||||
conv_param.output_spatial_lengths_[1],
|
||||
conv_param.output_spatial_lengths_[2]},
|
||||
{
|
||||
conv_param.K_, // g
|
||||
0, // n
|
||||
1, // k
|
||||
0, // z
|
||||
0, // y
|
||||
0 // x
|
||||
});
|
||||
|
||||
const auto residual_g_n_k_wos_desc =
|
||||
HostTensorDescriptor({conv_param.G_,
|
||||
conv_param.N_,
|
||||
conv_param.K_,
|
||||
conv_param.output_spatial_lengths_[0],
|
||||
conv_param.output_spatial_lengths_[1],
|
||||
conv_param.output_spatial_lengths_[2]},
|
||||
{
|
||||
conv_param.K_, // g
|
||||
0, // n
|
||||
1, // k
|
||||
0, // z
|
||||
0, // y
|
||||
0 // x
|
||||
});
|
||||
|
||||
const auto out_g_n_k_wos_desc = HostTensorDescriptor(
|
||||
{conv_param.G_,
|
||||
conv_param.N_,
|
||||
conv_param.K_,
|
||||
conv_param.output_spatial_lengths_[0],
|
||||
conv_param.output_spatial_lengths_[1],
|
||||
conv_param.output_spatial_lengths_[2]},
|
||||
{
|
||||
conv_param.K_, // g
|
||||
conv_param.output_spatial_lengths_[0] * conv_param.output_spatial_lengths_[1] *
|
||||
conv_param.output_spatial_lengths_[2] * conv_param.G_ * conv_param.K_, // n
|
||||
1, // k
|
||||
conv_param.output_spatial_lengths_[1] * conv_param.output_spatial_lengths_[2] *
|
||||
conv_param.G_ * conv_param.K_, // do
|
||||
conv_param.output_spatial_lengths_[2] * conv_param.G_ * conv_param.K_, // ho
|
||||
conv_param.G_ * conv_param.K_ // wo
|
||||
});
|
||||
|
||||
return run_grouped_conv_fwd_bias_relu_add<3,
|
||||
InKernelDataType,
|
||||
WeiKernelDataType,
|
||||
CShuffleDataType,
|
||||
OutKernelDataType,
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
OutElementOp,
|
||||
InUserDataType,
|
||||
WeiUserDataType,
|
||||
OutUserDataType,
|
||||
DeviceGroupedConvNDFwdInstance<3,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
BiasLayout,
|
||||
ResidualLayout,
|
||||
OutLayout>>(
|
||||
do_verification,
|
||||
init_method,
|
||||
time_kernel,
|
||||
conv_param,
|
||||
in_g_n_c_wis_desc,
|
||||
wei_g_k_c_xs_desc,
|
||||
bias_g_n_k_wos_desc,
|
||||
residual_g_n_k_wos_desc,
|
||||
out_g_n_k_wos_desc,
|
||||
in_element_op,
|
||||
wei_element_op,
|
||||
out_element_op);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -14,39 +14,38 @@ namespace device {
|
||||
// Convolution Forward:
|
||||
// input : input image A[G, N, C, Hi, Wi],
|
||||
// input : weight B[G, K, C, Y, X],
|
||||
// input : D0[G, N, K, Ho, Wo], D1[G, N, K, Ho, Wo], ...
|
||||
// output : output image E[G, N, K, Ho, Wo]
|
||||
// C = a_op(A) * b_op(B)
|
||||
// E = cde_op(C, D0, D1, ...)
|
||||
template <index_t NDimSpatial,
|
||||
typename ALayout,
|
||||
typename BLayout,
|
||||
typename CLayout,
|
||||
typename ADataType,
|
||||
typename BDataType,
|
||||
typename CDataType,
|
||||
typename AElementwiseOperation,
|
||||
typename BElementwiseOperation,
|
||||
typename CElementwiseOperation>
|
||||
typename InLayout,
|
||||
typename WeiLayout,
|
||||
typename OutLayout,
|
||||
typename InDataType,
|
||||
typename WeiDataType,
|
||||
typename OutDataType,
|
||||
typename InElementwiseOperation,
|
||||
typename WeiElementwiseOperation,
|
||||
typename OutElementwiseOperation>
|
||||
struct DeviceGroupedConvFwd : public BaseOperator
|
||||
{
|
||||
virtual std::unique_ptr<BaseArgument>
|
||||
MakeArgumentPointer(const void* p_a, // input image
|
||||
const void* p_b, // weight
|
||||
void* p_c, // output image
|
||||
const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_lengths,
|
||||
const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_strides,
|
||||
const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_lengths,
|
||||
const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_strides,
|
||||
const std::array<index_t, NDimSpatial + 3>& c_g_n_k_wos_lengths,
|
||||
const std::array<index_t, NDimSpatial + 3>& c_g_n_k_wos_strides,
|
||||
MakeArgumentPointer(const void* p_in, // input image
|
||||
const void* p_wei, // weight
|
||||
void* p_out, // output image
|
||||
const std::array<index_t, NDimSpatial + 3>& in_g_n_c_wis_lengths,
|
||||
const std::array<index_t, NDimSpatial + 3>& in_g_n_c_wis_strides,
|
||||
const std::array<index_t, NDimSpatial + 3>& wei_g_k_c_xs_lengths,
|
||||
const std::array<index_t, NDimSpatial + 3>& wei_g_k_c_xs_strides,
|
||||
const std::array<index_t, NDimSpatial + 3>& out_g_n_k_wos_lengths,
|
||||
const std::array<index_t, NDimSpatial + 3>& out_g_n_k_wos_strides,
|
||||
const std::array<index_t, NDimSpatial>& conv_filter_strides,
|
||||
const std::array<index_t, NDimSpatial>& conv_filter_dilations,
|
||||
const std::array<index_t, NDimSpatial>& input_left_pads,
|
||||
const std::array<index_t, NDimSpatial>& input_right_pads,
|
||||
const AElementwiseOperation& a_element_op,
|
||||
const BElementwiseOperation& b_element_op,
|
||||
const CElementwiseOperation& c_element_op) = 0;
|
||||
const InElementwiseOperation& in_element_op,
|
||||
const WeiElementwiseOperation& wei_element_op,
|
||||
const OutElementwiseOperation& out_element_op) = 0;
|
||||
|
||||
virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
|
||||
};
|
||||
|
||||
@@ -95,7 +95,7 @@ template <typename Activation>
|
||||
using Add_Activation_Mul_Clamp =
|
||||
ck::tensor_operation::element_wise::Add_Activation_Mul_Clamp<Activation>;
|
||||
|
||||
template <typename DeviceOp>
|
||||
template <typename DeviceOp, typename Tag = void>
|
||||
struct DeviceOperationInstanceFactory;
|
||||
|
||||
} // namespace instance
|
||||
|
||||
@@ -3,11 +3,9 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
|
||||
|
||||
@@ -3,11 +3,9 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
|
||||
|
||||
@@ -9,12 +9,9 @@
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp"
|
||||
|
||||
#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_dl.hpp"
|
||||
|
||||
#include "ck/library/utility/check_err.hpp"
|
||||
@@ -224,26 +221,25 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
|
||||
|
||||
for(auto& op_ptr : op_ptrs)
|
||||
{
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(
|
||||
in_device_buf.GetDeviceBuffer(),
|
||||
wei_device_buf.GetDeviceBuffer(),
|
||||
std::array<const void*, 0>{},
|
||||
out_device_buf.GetDeviceBuffer(),
|
||||
a_g_n_c_wis_lengths,
|
||||
a_g_n_c_wis_strides,
|
||||
b_g_k_c_xs_lengths,
|
||||
b_g_k_c_xs_strides,
|
||||
std::array<std::array<ck::index_t, NDimSpatial + 3>, 0>{{}},
|
||||
std::array<std::array<ck::index_t, NDimSpatial + 3>, 0>{{}},
|
||||
e_g_n_k_wos_lengths,
|
||||
e_g_n_k_wos_strides,
|
||||
conv_filter_strides,
|
||||
conv_filter_dilations,
|
||||
input_left_pads,
|
||||
input_right_pads,
|
||||
in_element_op,
|
||||
wei_element_op,
|
||||
out_element_op);
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(in_device_buf.GetDeviceBuffer(),
|
||||
wei_device_buf.GetDeviceBuffer(),
|
||||
{},
|
||||
out_device_buf.GetDeviceBuffer(),
|
||||
a_g_n_c_wis_lengths,
|
||||
a_g_n_c_wis_strides,
|
||||
b_g_k_c_xs_lengths,
|
||||
b_g_k_c_xs_strides,
|
||||
{},
|
||||
{},
|
||||
e_g_n_k_wos_lengths,
|
||||
e_g_n_k_wos_strides,
|
||||
conv_filter_strides,
|
||||
conv_filter_dilations,
|
||||
input_left_pads,
|
||||
input_right_pads,
|
||||
in_element_op,
|
||||
wei_element_op,
|
||||
out_element_op);
|
||||
|
||||
run_impl(op_ptr, argument_ptr);
|
||||
}
|
||||
@@ -262,8 +258,10 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
|
||||
WeiElementOp,
|
||||
OutElementOp>;
|
||||
|
||||
// get device op instances
|
||||
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
|
||||
DeviceOp>::GetInstances();
|
||||
|
||||
std::cout << "dl found " << op_ptrs.size() << " instances" << std::endl;
|
||||
|
||||
for(auto& op_ptr : op_ptrs)
|
||||
|
||||
Reference in New Issue
Block a user