mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-04-19 22:39:03 +00:00
Add examples of Conv + reduction (data type: int4, int8, bf16, fp16, fp32) (#380)
* Refactor the design of DeviceGemmMultipleDMultipleR_Xdl_CShuffle * Add 'DeviceGroupedConvFwdMultipleDMultipleR' interface * Add DeviceGroupedConvFwdMultipleDMultipleR_Xdl_CShuffle * Remove 'GridwiseConvFwdMultipleDMultipleR_xdl_cshuffle' * Add 'TransformConvFwdToGemm<>' utility class (from Chao) * Use 'TransformConvFwdToGemm<>' to shorten code * Fix ill-formed method declaration * Re-implement MakeRGridDescriptor_M() function * Change problem description * Use macro to define layout types * Define K-reduced output tensor layout types * Let user to decide R output tensor layout * Rename variables * Add padding to the reduced output tensor if necessary * Extract common code as helper method * Remove debug message * Add missing include directive * Add partial fp16 Conv + Reduction example * Add example verification code for 2D Conv problem * Use type alias to simplify code * Share code across different-dimension Conv problems * Rename file/functions from run_conv_fwd* to run_convnd_fwd* * Make example code more verbose * Add code to support 1D & 3D Conv + Reduction on host * Add more examples for data type: bf16, fp32 * Add example for int8 * Add custom target to group examples * Use more general custom target name * Change the description in error message * Disable testing for example other than fp32 * Add examplel for int4 (just copy from int8) * Fix wrong data type * Use larger data type for intermediate tensors * Finish int4 example * Undefine macro PP_DEFINE_LAYOUT_TYPE() after use * Use named variables to replace magic numbers * Remove debug messages * Use same A/B data type for host Conv in int4 example * Add check for the 'RLayout' type argument * Group same-dim-layouts together in 'LayoutSetting<>' * Add 'final' specifier to utility classes * Use different initialization method for examples * Remove macro PP_DEFINE_LAYOUT_TYPE() * Fix code-comment mismatch * Use more reasonable initialization value for all data types * Default use init_method=1 for all 
examples * Remove never-used code * Remove confusing out-of-date comments * clean Co-authored-by: Chao Liu <chao.liu2@amd.com> Co-authored-by: Chao Liu <lc.roy86@gmail.com>
This commit is contained in:
@@ -0,0 +1,16 @@
|
||||
# Umbrella target that builds every Conv + reduction example in this directory
# with a single `make example_convnd_fwd_reduce_xdl`.
add_custom_target(example_convnd_fwd_reduce_xdl)

# int8 and fp32 examples are part of automated testing; bf16/fp16 are built
# but excluded from testing (NOTE(review): presumably because their reduced
# precision makes result checking unreliable — confirm with CI owners).
add_example_executable(example_convnd_fwd_max_xdl_int8 convnd_fwd_max_xdl_int8.cpp)
add_example_executable_no_testing(example_convnd_fwd_max_xdl_bf16 convnd_fwd_max_xdl_bf16.cpp)
add_example_executable_no_testing(example_convnd_fwd_max_xdl_fp16 convnd_fwd_max_xdl_fp16.cpp)
add_example_executable(example_convnd_fwd_max_xdl_fp32 convnd_fwd_max_xdl_fp32.cpp)

# Hook every example into the umbrella target.
add_dependencies(example_convnd_fwd_reduce_xdl example_convnd_fwd_max_xdl_int8)
add_dependencies(example_convnd_fwd_reduce_xdl example_convnd_fwd_max_xdl_bf16)
add_dependencies(example_convnd_fwd_reduce_xdl example_convnd_fwd_max_xdl_fp16)
add_dependencies(example_convnd_fwd_reduce_xdl example_convnd_fwd_max_xdl_fp32)

# The int4 example needs the experimental _BitInt compiler extension.
if(USE_BITINT_EXTENSION_INT4)
    add_example_executable(example_convnd_fwd_max_xdl_int4 convnd_fwd_max_xdl_int4.cpp)
    add_dependencies(example_convnd_fwd_reduce_xdl example_convnd_fwd_max_xdl_int4)
endif(USE_BITINT_EXTENSION_INT4)
|
||||
167
example/10_convnd_fwd_multiple_d_multiple_reduce/common.hpp
Normal file
167
example/10_convnd_fwd_multiple_d_multiple_reduce/common.hpp
Normal file
@@ -0,0 +1,167 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
#include <cstdint>
|
||||
#include <cstdlib>
|
||||
#include <iostream>
|
||||
#include <iterator>
|
||||
#include <numeric>
|
||||
#include <type_traits>
|
||||
#include <vector>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d_multiple_r_xdl_cshuffle.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
#include "ck/library/utility/check_err.hpp"
|
||||
#include "ck/library/utility/convolution_parameter.hpp"
|
||||
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
|
||||
#include "ck/library/utility/device_memory.hpp"
|
||||
#include "ck/library/utility/fill.hpp"
|
||||
#include "ck/library/utility/host_tensor.hpp"
|
||||
#include "ck/library/utility/host_tensor_generator.hpp"
|
||||
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
|
||||
|
||||
// Short names for the element data types shared by all examples in this
// directory.
using BF16 = ck::bhalf_t;
using FP16 = ck::half_t;
using FP32 = float;
#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
// int4 is an experimental compiler extension; only expose the alias when the
// toolchain provides ck::int4_t.
using I4 = ck::int4_t;
#endif
using I8 = std::int8_t;
using I32 = std::int32_t;
|
||||
|
||||
// Bundles the four tensor layouts that describe one Conv + reduction problem:
// A (input image), B (weights), D/E (the elementwise inputs and the conv
// output, which share a layout), and R (the K-reduced output tensor).
template <typename ALay, typename BLay, typename DELay, typename RLay>
struct LayoutSetting
{
    using ALayout = ALay;
    using BLayout = BLay;
    using DELayout = DELay;
    using RLayout = RLay;
};
|
||||
|
||||
// Maps a spatial dimension count (1/2/3) to its canonical layout bundle.
// Only the three specializations below exist; any other NDimSpatial fails to
// compile, which is intentional.
template <ck::index_t NDimSpatial>
struct LayoutSettingSelector;

namespace ctl = ck::tensor_layout::convolution;

// 1-D conv: G-major, channel-last input/weight, K-reduced output is GNW.
template <>
struct LayoutSettingSelector<1> final : LayoutSetting<ctl::GNWC, ctl::GKXC, ctl::GNWK, ctl::GNW>
{
};

// 2-D conv.
template <>
struct LayoutSettingSelector<2> final : LayoutSetting<ctl::GNHWC, ctl::GKYXC, ctl::GNHWK, ctl::GNHW>
{
};

// 3-D conv.
template <>
struct LayoutSettingSelector<3> final
    : LayoutSetting<ctl::GNDHWC, ctl::GKZYXC, ctl::GNDHWK, ctl::GNDHW>
{
};
|
||||
|
||||
// Convenience aliases so example code can write e.g. ALayout<2> instead of
// typename LayoutSettingSelector<2>::ALayout.
template <ck::index_t NDimSpatial>
using ALayout = typename LayoutSettingSelector<NDimSpatial>::ALayout;

template <ck::index_t NDimSpatial>
using BLayout = typename LayoutSettingSelector<NDimSpatial>::BLayout;

template <ck::index_t NDimSpatial>
using DELayout = typename LayoutSettingSelector<NDimSpatial>::DELayout;

template <ck::index_t NDimSpatial>
using RLayout = typename LayoutSettingSelector<NDimSpatial>::RLayout;
|
||||
|
||||
// Runtime options parsed from the command line (see parse_cmd_args()).
struct ExecutionConfig final
{
    // Compare device results against a host reference implementation.
    bool do_verification = true;
    // 0 = leave buffers uninitialized, 1 = small integer values,
    // otherwise = uniform decimal values (see run_convnd_fwd_max()).
    int init_method = 1;
    // Measure and report kernel time / TFlops / GB/s.
    bool time_kernel = false;
};
|
||||
|
||||
// Prints command-line usage to stderr; called when argument parsing fails.
inline void print_help_msg()
{
    std::cerr << "arg1: verification (0=no, 1=yes)\n"
              << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
              << "arg3: time kernel (0=no, 1=yes)\n"
              << ck::utils::conv::get_conv_param_parser_helper_msg() << std::endl;
}
|
||||
|
||||
inline bool parse_cmd_args(int argc,
|
||||
char* argv[],
|
||||
ck::utils::conv::ConvParam& problem_size,
|
||||
ExecutionConfig& config)
|
||||
{
|
||||
constexpr int num_execution_config_args =
|
||||
3; // arguments for do_verification, init_method, time_kernel
|
||||
constexpr int num_conv_param_leading_args = 5; // arguments for num_dim_spatial_, G_, N_, K_, C_
|
||||
|
||||
constexpr int threshold_to_catch_partial_args = 1 + num_execution_config_args;
|
||||
constexpr int threshold_to_catch_all_args =
|
||||
threshold_to_catch_partial_args + num_conv_param_leading_args;
|
||||
|
||||
if(argc == 1)
|
||||
{
|
||||
// use default
|
||||
}
|
||||
// catch only ExecutionConfig arguments
|
||||
else if(argc == threshold_to_catch_partial_args)
|
||||
{
|
||||
config.do_verification = std::stoi(argv[1]);
|
||||
config.init_method = std::stoi(argv[2]);
|
||||
config.time_kernel = std::stoi(argv[3]);
|
||||
}
|
||||
// catch both ExecutionConfig & ConvParam arguments
|
||||
else if(threshold_to_catch_all_args < argc && ((argc - threshold_to_catch_all_args) % 3 == 0))
|
||||
{
|
||||
config.do_verification = std::stoi(argv[1]);
|
||||
config.init_method = std::stoi(argv[2]);
|
||||
config.time_kernel = std::stoi(argv[3]);
|
||||
|
||||
const ck::index_t num_dim_spatial = std::stoi(argv[4]);
|
||||
problem_size = ck::utils::conv::parse_conv_param(
|
||||
num_dim_spatial, threshold_to_catch_partial_args, argv);
|
||||
}
|
||||
else
|
||||
{
|
||||
print_help_msg();
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// Builds the host descriptor for the K-reduced output tensor R0, whose shape
// is [G, N, <output spatial lengths...>] — the K dimension is reduced away.
inline HostTensorDescriptor
make_r0_host_tensor_descriptor(const ck::utils::conv::ConvParam& problem_size)
{
    const auto& spatial_lengths = problem_size.output_spatial_lengths_;

    std::vector<ck::index_t> dimensions;
    dimensions.reserve(2 + spatial_lengths.size());
    dimensions.push_back(problem_size.G_);
    dimensions.push_back(problem_size.N_);
    dimensions.insert(dimensions.end(), begin(spatial_lengths), end(spatial_lengths));

    return HostTensorDescriptor(dimensions);
}
|
||||
|
||||
template <typename Lengths, typename Strides>
|
||||
void unpack_host_tensor_descriptor(const HostTensorDescriptor& descriptor,
|
||||
Lengths& lengths,
|
||||
Strides& strides)
|
||||
{
|
||||
assert(size(descriptor.GetLengths()) == size(lengths));
|
||||
std::copy_n(begin(descriptor.GetLengths()), size(descriptor.GetLengths()), begin(lengths));
|
||||
|
||||
assert(size(descriptor.GetStrides()) == size(strides));
|
||||
std::copy_n(begin(descriptor.GetStrides()), size(descriptor.GetStrides()), begin(strides));
|
||||
}
|
||||
|
||||
// Whole-range convenience wrapper around std::copy. The trailing return type
// deliberately SFINAEs this overload away for non-range arguments.
template <typename Range, typename OutputIterator>
auto copy(const Range& source, OutputIterator dest)
    -> decltype(std::copy(std::begin(source), std::end(source), dest))
{
    auto first = std::begin(source);
    auto last  = std::end(source);
    return std::copy(first, last, dest);
}
|
||||
@@ -0,0 +1,18 @@
|
||||
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

// Conv + max-reduction example instantiated for bf16 tensors.
// GEMM accumulation, C-shuffle, and the reduction use fp32 intermediates.
#include "common.hpp"

using ADataType = BF16;
using BDataType = BF16;
using AccDataType = FP32;
using CShuffleDataType = FP32;
using DsDataType = ck::Tuple<>; // no extra elementwise D tensors
using EDataType = BF16;
using ReduceAccDataType = FP32;
using R0DataType = FP32;
using RsDataType = ck::Tuple<R0DataType>; // single reduction output (max over K)

// Pulls in DeviceInstance/HostInstance and run_convnd_fwd_max_example(),
// which consume the type aliases above.
#include "run_convnd_fwd_max_example.inc"

// run_convnd_fwd_max_example returns true on success; invert for exit code 0.
int main(int argc, char* argv[]) { return !run_convnd_fwd_max_example(argc, argv); }
|
||||
@@ -0,0 +1,18 @@
|
||||
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

// Conv + max-reduction example instantiated for fp16 tensors.
// GEMM accumulation, C-shuffle, and the reduction use fp32 intermediates.
#include "common.hpp"

using ADataType = FP16;
using BDataType = FP16;
using AccDataType = FP32;
using CShuffleDataType = FP32;
using DsDataType = ck::Tuple<>; // no extra elementwise D tensors
using EDataType = FP16;
using ReduceAccDataType = FP32;
using R0DataType = FP32;
using RsDataType = ck::Tuple<R0DataType>; // single reduction output (max over K)

// Pulls in DeviceInstance/HostInstance and run_convnd_fwd_max_example(),
// which consume the type aliases above.
#include "run_convnd_fwd_max_example.inc"

// run_convnd_fwd_max_example returns true on success; invert for exit code 0.
int main(int argc, char* argv[]) { return !run_convnd_fwd_max_example(argc, argv); }
|
||||
@@ -0,0 +1,18 @@
|
||||
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

// Conv + max-reduction example instantiated entirely in fp32.
#include "common.hpp"

using ADataType = FP32;
using BDataType = FP32;
using AccDataType = FP32;
using CShuffleDataType = FP32;
using DsDataType = ck::Tuple<>; // no extra elementwise D tensors
using EDataType = FP32;
using ReduceAccDataType = FP32;
using R0DataType = FP32;
using RsDataType = ck::Tuple<R0DataType>; // single reduction output (max over K)

// Pulls in DeviceInstance/HostInstance and run_convnd_fwd_max_example(),
// which consume the type aliases above.
#include "run_convnd_fwd_max_example.inc"

// run_convnd_fwd_max_example returns true on success; invert for exit code 0.
int main(int argc, char* argv[]) { return !run_convnd_fwd_max_example(argc, argv); }
|
||||
@@ -0,0 +1,26 @@
|
||||
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

// Conv + max-reduction example for experimental int4 data. Host tensors hold
// ck::int4_t values, but the device kernel operates on int8 storage
// (KernelADataType/KernelBDataType below); run_convnd_fwd_max_example.inc
// converts between the two under BUILD_INT4_EXAMPLE and static_asserts that
// the two types have identical size.
#ifndef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
#error Should compile this file with ck::int4_t support
#endif

// Selects the int4-specific code paths in run_convnd_fwd_max_example.inc.
#define BUILD_INT4_EXAMPLE

#include "common.hpp"

using ADataType = I4;
using BDataType = I4;
using KernelADataType = I8; // on-device storage type for A
using KernelBDataType = I8; // on-device storage type for B
using AccDataType = I32;
using CShuffleDataType = I32;
using DsDataType = ck::Tuple<>; // no extra elementwise D tensors
using EDataType = I32;
using ReduceAccDataType = I32;
using R0DataType = I32;
using RsDataType = ck::Tuple<R0DataType>; // single reduction output (max over K)

#include "run_convnd_fwd_max_example.inc"

// run_convnd_fwd_max_example returns true on success; invert for exit code 0.
int main(int argc, char* argv[]) { return !run_convnd_fwd_max_example(argc, argv); }
|
||||
@@ -0,0 +1,18 @@
|
||||
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

// Conv + max-reduction example instantiated for int8 inputs with int32
// accumulation and int32 outputs.
#include "common.hpp"

using ADataType = I8;
using BDataType = I8;
using AccDataType = I32;
using CShuffleDataType = I32;
using DsDataType = ck::Tuple<>; // no extra elementwise D tensors
using EDataType = I32;
using ReduceAccDataType = I32;
using R0DataType = I32;
using RsDataType = ck::Tuple<R0DataType>; // single reduction output (max over K)

// Pulls in DeviceInstance/HostInstance and run_convnd_fwd_max_example(),
// which consume the type aliases above.
#include "run_convnd_fwd_max_example.inc"

// run_convnd_fwd_max_example returns true on success; invert for exit code 0.
int main(int argc, char* argv[]) { return !run_convnd_fwd_max_example(argc, argv); }
|
||||
@@ -0,0 +1,313 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
// Shorthand for compile-time integer sequences used in the tuning parameters.
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;

using PassThrough = ck::tensor_operation::element_wise::PassThrough;

// All elementwise stages are identity: this example only fuses the reduction.
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CDEElementOp = PassThrough;
using QsElementOp = ck::Tuple<PassThrough>;
using RsElementOp = ck::Tuple<PassThrough>;

// ReduceOp: per-thread partial max over K, combined globally via AtomicMax.
using RsThreadReduceOp = ck::Tuple<ck::reduce::Max>;

using RsGlobalReduceOp =
    ck::InMemoryDataOperationEnumSequence<ck::InMemoryDataOperationEnum::AtomicMax>;

static constexpr auto ConvSpec =
    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;

static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;

// Device operation: grouped conv fwd + multiple-D elementwise + multiple-R
// reduction, XDL + C-shuffle pipeline. The column headers below document the
// long tuning-parameter list; the int4 build swaps in the int8 kernel storage
// types (KernelADataType/KernelBDataType), everything else is identical.
// clang-format off
template <ck::index_t NDimSpatial>
using DeviceInstance =
    ck::tensor_operation::device::DeviceGroupedConvFwdMultipleDMultipleR_Xdl_CShuffle
//######| NDimSpatial| ALayout| BLayout| DELayout| RLayout| AData| BData| AccData| CShuffle| DsData| EData| ReduceAccData| RsData| A| B| CDE| Qs| Rs| Thread| Global| Conv| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CDRThreadTransfer| CDE| RThreadTransfer|
//######| | | | | | Type| Type| Type| DataType| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Elementwise| Elementwise| Reduce| Reduce| Fwd|Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| ClusterLengths| ReduceThreadTransfer| DstScalarPerVector|
//######| | | | | | | | | | | | | | Operation| Operation| Operation| Operation| Operation| Operation| Operation| Specialization| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _MPerBlock_NPerBlock| ScalarPerVector| _MPerBlock|
//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | _NPerBlock| |
#ifdef BUILD_INT4_EXAMPLE
        < NDimSpatial, ALayout<NDimSpatial>, BLayout<NDimSpatial>, DELayout<NDimSpatial>, RLayout<NDimSpatial>, KernelADataType, KernelBDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ReduceAccDataType, RsDataType, AElementOp, BElementOp, CDEElementOp, QsElementOp, RsElementOp, RsThreadReduceOp, RsGlobalReduceOp, ConvSpec, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<64, 4>, 4, 1>;
#else
        < NDimSpatial, ALayout<NDimSpatial>, BLayout<NDimSpatial>, DELayout<NDimSpatial>, RLayout<NDimSpatial>, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ReduceAccDataType, RsDataType, AElementOp, BElementOp, CDEElementOp, QsElementOp, RsElementOp, RsThreadReduceOp, RsGlobalReduceOp, ConvSpec, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<64, 4>, 4, 1>;
#endif

// CPU reference for the convolution part only; the reduction is verified with
// explicit host loops in run_convnd_fwd_max().
template <ck::index_t NDimSpatial>
using HostInstance = ck::tensor_operation::host::ReferenceConvFwd
    <NDimSpatial, ADataType, BDataType, EDataType, AElementOp, BElementOp, PassThrough>;
// clang-format on
|
||||
|
||||
// Runs grouped conv forward fused with a max-reduction over K on the device,
// optionally verifying against a host conv + explicit host reduction loops.
// Returns false if the device does not support the argument or verification
// fails. Relies on the type aliases (ADataType, EDataType, R0DataType, ...)
// defined by the including translation unit.
template <ck::index_t NDimSpatial>
bool run_convnd_fwd_max(const ck::utils::conv::ConvParam& problem_size,
                        const ExecutionConfig& config)
{
    static_assert(1 <= NDimSpatial && NDimSpatial <= 3, "Unsupported NDimSpatial");

#if defined(BUILD_INT4_EXAMPLE) && defined(CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4)
    // int4 host data is shipped to the device in int8 storage; the conversion
    // below is only valid if the two types have identical size.
    static_assert(sizeof(ck::int4_t) == sizeof(int8_t));
#endif

    // Host tensor descriptors for input image, weights, conv output, and the
    // K-reduced output R0.
    const auto conv_input_g_n_c_wis_desc =
        ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<ALayout<NDimSpatial>>(
            problem_size);

    const auto conv_weight_g_k_c_xs_desc =
        ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed<BLayout<NDimSpatial>>(
            problem_size);

    const auto conv_output_g_n_k_wos_desc =
        ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<DELayout<NDimSpatial>>(
            problem_size);

    const auto r0_desc = make_r0_host_tensor_descriptor(problem_size);

    Tensor<ADataType> conv_input(conv_input_g_n_c_wis_desc);
    Tensor<BDataType> conv_weight(conv_weight_g_k_c_xs_desc);
    Tensor<EDataType> conv_output_device(conv_output_g_n_k_wos_desc);
    Tensor<R0DataType> r0_device(r0_desc);

    // init_method: 0 = leave uninitialized, 1 = small integers (exact in every
    // supported data type), otherwise = uniform decimal values.
    switch(config.init_method)
    {
    case 0: break;
    case 1:
        ck::utils::FillUniformDistributionIntegerValue<ADataType>{-8, 7}(conv_input.begin(),
                                                                         conv_input.end());
        ck::utils::FillUniformDistributionIntegerValue<BDataType>{-8, 7}(conv_weight.begin(),
                                                                         conv_weight.end());
        break;
    default:
        ck::utils::FillUniformDistribution<ADataType>{-5, 5}(conv_input.begin(), conv_input.end());
        ck::utils::FillUniformDistribution<BDataType>{-5, 5}(conv_weight.begin(),
                                                             conv_weight.end());
    }

    DeviceMem conv_input_device_buf(sizeof(ADataType) * conv_input.mDesc.GetElementSpaceSize());
    DeviceMem conv_weight_device_buf(sizeof(BDataType) * conv_weight.mDesc.GetElementSpaceSize());
    DeviceMem conv_output_device_buf(sizeof(EDataType) *
                                     conv_output_device.mDesc.GetElementSpaceSize());
    DeviceMem r0_device_buf(sizeof(R0DataType) * r0_device.mDesc.GetElementSpaceSize());

#ifdef BUILD_INT4_EXAMPLE
    // Re-pack the int4 host tensors into the kernel's int8 storage type
    // before uploading.
    const Tensor<KernelADataType> conv_input_converted(conv_input);
    const Tensor<KernelBDataType> conv_weight_converted(conv_weight);

    conv_input_device_buf.ToDevice(conv_input_converted.mData.data());
    conv_weight_device_buf.ToDevice(conv_weight_converted.mData.data());
#else
    conv_input_device_buf.ToDevice(conv_input.mData.data());
    conv_weight_device_buf.ToDevice(conv_weight.mData.data());
#endif

    // The device API takes lengths/strides as fixed-size arrays; unpack the
    // host descriptors and the ConvParam vectors into them.
    std::array<ck::index_t, NDimSpatial + 3> conv_input_g_n_c_wis_lengths{},
        conv_input_g_n_c_wis_strides{};
    std::array<ck::index_t, NDimSpatial + 3> conv_weight_g_k_c_xs_lengths{},
        conv_weight_g_k_c_xs_strides{};
    std::array<ck::index_t, NDimSpatial + 3> conv_output_g_n_k_wos_lengths{},
        conv_output_g_n_k_wos_strides{};
    std::array<ck::index_t, NDimSpatial + 2> r0_lengths{}, r0_strides{};
    std::array<ck::index_t, NDimSpatial> conv_filter_strides{}, conv_filter_dilations{};
    std::array<ck::index_t, NDimSpatial> input_left_pads{}, input_right_pads{};

    unpack_host_tensor_descriptor(
        conv_input_g_n_c_wis_desc, conv_input_g_n_c_wis_lengths, conv_input_g_n_c_wis_strides);
    unpack_host_tensor_descriptor(
        conv_weight_g_k_c_xs_desc, conv_weight_g_k_c_xs_lengths, conv_weight_g_k_c_xs_strides);
    unpack_host_tensor_descriptor(
        conv_output_g_n_k_wos_desc, conv_output_g_n_k_wos_lengths, conv_output_g_n_k_wos_strides);
    unpack_host_tensor_descriptor(r0_desc, r0_lengths, r0_strides);

    copy(problem_size.conv_filter_strides_, begin(conv_filter_strides));
    copy(problem_size.conv_filter_dilations_, begin(conv_filter_dilations));
    copy(problem_size.input_left_pads_, begin(input_left_pads));
    copy(problem_size.input_right_pads_, begin(input_right_pads));

    // run Conv + Reduction on device
    auto conv     = DeviceInstance<NDimSpatial>{};
    auto invoker  = conv.MakeInvoker();
    auto argument = conv.MakeArgument(conv_input_device_buf.GetDeviceBuffer(),
                                      conv_weight_device_buf.GetDeviceBuffer(),
                                      std::array<const void*, 0>{}, // no D tensors
                                      conv_output_device_buf.GetDeviceBuffer(),
                                      {r0_device_buf.GetDeviceBuffer()},
                                      conv_input_g_n_c_wis_lengths,
                                      conv_input_g_n_c_wis_strides,
                                      conv_weight_g_k_c_xs_lengths,
                                      conv_weight_g_k_c_xs_strides,
                                      std::array<std::array<ck::index_t, NDimSpatial + 3>, 0>{{}},
                                      std::array<std::array<ck::index_t, NDimSpatial + 3>, 0>{{}},
                                      conv_output_g_n_k_wos_lengths,
                                      conv_output_g_n_k_wos_strides,
                                      r0_lengths,
                                      r0_strides,
                                      conv_filter_strides,
                                      conv_filter_dilations,
                                      input_left_pads,
                                      input_right_pads,
                                      AElementOp{},
                                      BElementOp{},
                                      CDEElementOp{},
                                      QsElementOp{},
                                      RsElementOp{});

    if(!conv.IsSupportedArgument(argument))
    {
        std::cerr << "wrong! device_conv with the specified compilation parameters does "
                     "not support this Conv problem"
                  << std::endl;
        return false;
    }

    const float avg_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});

    // avg_time is in ms: flop/(1e9*ms) = TFlops, bytes/(1e6*ms) = GB/s.
    const std::size_t flop      = problem_size.GetFlops();
    const std::size_t num_btype = problem_size.GetByte<ADataType, BDataType, EDataType>();

    const float tflops     = static_cast<float>(flop) / 1.E9 / avg_time;
    const float gb_per_sec = num_btype / 1.E6 / avg_time;
    std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
              << conv.GetTypeString() << std::endl;

    if(config.do_verification)
    {
        Tensor<EDataType> conv_output_host(conv_output_g_n_k_wos_desc);

        // run Conv + Reduction on host: reference conv first, then explicit
        // max-over-K loops per spatial rank below.
        auto ref_conv     = HostInstance<NDimSpatial>{};
        auto ref_invoker  = ref_conv.MakeInvoker();
        auto ref_argument = ref_conv.MakeArgument(conv_input,
                                                  conv_weight,
                                                  conv_output_host,
                                                  problem_size.conv_filter_strides_,
                                                  problem_size.conv_filter_dilations_,
                                                  problem_size.input_left_pads_,
                                                  problem_size.input_right_pads_,
                                                  AElementOp{},
                                                  BElementOp{},
                                                  PassThrough{});

        ref_invoker.Run(ref_argument);

        Tensor<R0DataType> r0_host(r0_device.mDesc);

        auto reduce0_op = RsThreadReduceOp{}[ck::Number<0>{}];

        // Output dims are [G, N, K, <spatial...>]; K (index 2) is reduced.
        auto& output_dims = conv_output_g_n_k_wos_desc.GetLengths();

        if constexpr(NDimSpatial == 1)
        {
            for(std::size_t g = 0; g < output_dims[0]; ++g)
            {
                for(std::size_t n = 0; n < output_dims[1]; ++n)
                {
                    for(std::size_t w = 0; w < output_dims[3]; ++w)
                    {
                        auto reduce0_acc = reduce0_op.GetIdentityValue<ReduceAccDataType>();
                        for(std::size_t k = 0; k < output_dims[2]; ++k)
                        {
                            auto e_val =
                                ck::type_convert<ReduceAccDataType>(conv_output_host(g, n, k, w));
                            reduce0_op(reduce0_acc, e_val);
                        }
                        r0_host(g, n, w) = ck::type_convert<R0DataType>(reduce0_acc);
                    }
                }
            }
        }
        else if constexpr(NDimSpatial == 2)
        {
            for(std::size_t g = 0; g < output_dims[0]; ++g)
            {
                for(std::size_t n = 0; n < output_dims[1]; ++n)
                {
                    for(std::size_t h = 0; h < output_dims[3]; ++h)
                    {
                        for(std::size_t w = 0; w < output_dims[4]; ++w)
                        {
                            auto reduce0_acc = reduce0_op.GetIdentityValue<ReduceAccDataType>();
                            for(std::size_t k = 0; k < output_dims[2]; ++k)
                            {
                                auto e_val = ck::type_convert<ReduceAccDataType>(
                                    conv_output_host(g, n, k, h, w));
                                reduce0_op(reduce0_acc, e_val);
                            }
                            r0_host(g, n, h, w) = ck::type_convert<R0DataType>(reduce0_acc);
                        }
                    }
                }
            }
        }
        else if constexpr(NDimSpatial == 3)
        {
            for(std::size_t g = 0; g < output_dims[0]; ++g)
            {
                for(std::size_t n = 0; n < output_dims[1]; ++n)
                {
                    for(std::size_t d = 0; d < output_dims[3]; ++d)
                    {
                        for(std::size_t h = 0; h < output_dims[4]; ++h)
                        {
                            for(std::size_t w = 0; w < output_dims[5]; ++w)
                            {
                                auto reduce0_acc = reduce0_op.GetIdentityValue<ReduceAccDataType>();
                                for(std::size_t k = 0; k < output_dims[2]; ++k)
                                {
                                    auto e_val = ck::type_convert<ReduceAccDataType>(
                                        conv_output_host(g, n, k, d, h, w));
                                    reduce0_op(reduce0_acc, e_val);
                                }
                                r0_host(g, n, d, h, w) = ck::type_convert<R0DataType>(reduce0_acc);
                            }
                        }
                    }
                }
            }
        }

        conv_output_device_buf.FromDevice(conv_output_device.mData.data());
        r0_device_buf.FromDevice(r0_device.mData.data());

        return ck::utils::check_err(conv_output_device.mData,
                                    conv_output_host.mData,
                                    "Error: incorrect results! (Matrix E)",
                                    1e-5f,
                                    1e-4f) &&
               ck::utils::check_err(r0_device.mData,
                                    r0_host.mData,
                                    "Error: incorrect results! (Matrix R0)",
                                    1e-5f,
                                    1e-4f);
    }

    return true;
}
|
||||
|
||||
bool run_convnd_fwd_max_example(int argc, char* argv[])
|
||||
{
|
||||
ck::utils::conv::ConvParam problem_size{
|
||||
2, 1, 128, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, {1, 1}};
|
||||
ExecutionConfig config;
|
||||
|
||||
if(!parse_cmd_args(argc, argv, problem_size, config))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
switch(problem_size.num_dim_spatial_)
|
||||
{
|
||||
case 1: return run_convnd_fwd_max<1>(problem_size, config);
|
||||
case 2: return run_convnd_fwd_max<2>(problem_size, config);
|
||||
case 3: return run_convnd_fwd_max<3>(problem_size, config);
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
@@ -26,6 +26,7 @@ add_subdirectory(02_gemm_bilinear)
|
||||
add_subdirectory(03_gemm_bias_relu)
|
||||
add_subdirectory(04_gemm_add_add_fastgelu)
|
||||
add_subdirectory(09_convnd_fwd)
|
||||
add_subdirectory(10_convnd_fwd_multiple_d_multiple_reduce)
|
||||
add_subdirectory(12_reduce)
|
||||
add_subdirectory(13_pool2d_fwd)
|
||||
add_subdirectory(14_gemm_xdl_requant_relu_requant)
|
||||
|
||||
@@ -0,0 +1,77 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "ck/tensor_operation/gpu/device/device_base.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
|
||||
// Grouped Convolution Forward:
// input : input image A[G, N, C, Hi, Wi],
// input : weight B[G, K, C, Y, X],
// input : D0[G, N, K, Ho, Wo], D1[G, N, K, Ho, Wo], ...
// output : output image E[G, N, K, Ho, Wo]
// output : R0[G, N, Ho, Wo], R1[G, N, Ho, Wo], ...
// C = a_op(A) * b_op(B)
// E = cde_op(C, D0, D1, ...)
// Q0 = reduce0(q_op0(E)), Q1 = reduce1(q_op1(E)), ...
// R0 = r_op0(Q0), R1 = r_op1(Q1), ...
// Assume:
// D0, D1, ... and E have the same layout
//
// Abstract interface: concrete device kernels derive from this and implement
// MakeArgumentPointer()/MakeInvokerPointer().
template <index_t NDimSpatial,
          typename ALayout,
          typename BLayout,
          typename DELayout,
          typename RLayout,
          typename ADataType,
          typename BDataType,
          typename DsDataType,
          typename EDataType,
          typename RsDataType,
          typename AElementwiseOperation,
          typename BElementwiseOperation,
          typename CDEElementwiseOperation,
          typename QsElementwiseOperation,
          typename RsElementwiseOperation>
struct DeviceGroupedConvFwdMultipleDMultipleR : public BaseOperator
{
    // Number of fused elementwise D inputs / reduced R outputs, taken from
    // the size of the corresponding type tuples.
    static constexpr index_t NumDTensor = DsDataType::Size();
    static constexpr index_t NumRTensor = RsDataType::Size();

    // Packs all problem pointers, shapes (lengths/strides per tensor), conv
    // attributes (strides/dilations/pads), and elementwise operators into a
    // type-erased argument for the invoker.
    virtual std::unique_ptr<BaseArgument> MakeArgumentPointer(
        const void* p_a,
        const void* p_b,
        const std::array<const void*, NumDTensor>& p_ds,
        void* p_e,
        std::array<void*, NumRTensor> p_rs,
        const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_lengths,
        const std::array<index_t, NDimSpatial + 3>& a_g_n_c_wis_strides,
        const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_lengths,
        const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_strides,
        const std::array<std::array<index_t, NDimSpatial + 3>, NumDTensor>& ds_g_n_k_wos_lengths,
        const std::array<std::array<index_t, NDimSpatial + 3>, NumDTensor>& ds_g_n_k_wos_strides,
        const std::array<index_t, NDimSpatial + 3>& e_g_n_k_wos_lengths,
        const std::array<index_t, NDimSpatial + 3>& e_g_n_k_wos_strides,
        const std::array<index_t, NDimSpatial + 2>& r_g_n_wos_lengths,
        const std::array<index_t, NDimSpatial + 2>& r_g_n_wos_strides,
        const std::array<index_t, NDimSpatial>& conv_filter_strides,
        const std::array<index_t, NDimSpatial>& conv_filter_dilations,
        const std::array<index_t, NDimSpatial>& input_left_pads,
        const std::array<index_t, NDimSpatial>& input_right_pads,
        const AElementwiseOperation& a_element_op,
        const BElementwiseOperation& b_element_op,
        const CDEElementwiseOperation& cde_element_op,
        const QsElementwiseOperation& qs_element_op,
        const RsElementwiseOperation& rs_element_op) = 0;

    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
};
|
||||
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
File diff suppressed because it is too large
Load Diff
@@ -93,7 +93,7 @@ struct GNDHWC : public BaseTensorLayout
|
||||
};
|
||||
|
||||
// input tensor
|
||||
// packed GNWC/GNHWC/GNDHWC
|
||||
// packed NWGC/NHWGC/NDHWGC
|
||||
struct NWGC : public BaseTensorLayout
|
||||
{
|
||||
static constexpr const char* name = "NWGC";
|
||||
@@ -330,6 +330,54 @@ struct G_NDHW_K : public BaseTensorLayout
|
||||
static constexpr const char* name = "G_NDHW_K";
|
||||
};
|
||||
|
||||
// K-reduced output tensor (packed), G-major variants
struct GNW : public BaseTensorLayout
{
    static constexpr const char* name = "GNW";
};

struct GNHW : public BaseTensorLayout
{
    static constexpr const char* name = "GNHW";
};

struct GNDHW : public BaseTensorLayout
{
    static constexpr const char* name = "GNDHW";
};

// K-reduced output tensor (packed), G-last variants
struct NWG : public BaseTensorLayout
{
    static constexpr const char* name = "NWG";
};

struct NHWG : public BaseTensorLayout
{
    static constexpr const char* name = "NHWG";
};

struct NDHWG : public BaseTensorLayout
{
    static constexpr const char* name = "NDHWG";
};

// K-reduced output tensor (strided)
struct G_NW : public BaseTensorLayout
{
    static constexpr const char* name = "G_NW";
};

struct G_NHW : public BaseTensorLayout
{
    static constexpr const char* name = "G_NHW";
};

struct G_NDHW : public BaseTensorLayout
{
    static constexpr const char* name = "G_NDHW";
};
|
||||
|
||||
} // namespace convolution
|
||||
|
||||
template <
|
||||
|
||||
@@ -7,6 +7,7 @@
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
|
||||
#include "ck/library/utility/convolution_parameter.hpp"
|
||||
#include "ck/library/utility/host_tensor.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace utils {
|
||||
|
||||
Reference in New Issue
Block a user