Mirror of https://github.com/ROCm/composable_kernel.git (synced 2026-04-20 06:49:15 +00:00)
Conv + quantization + tanh (#645)
* Rename file. Prepare to support another activation
* Add comment for quantization
* Extract out_elementop
* Add tanh example
* Add conv + bias + tanh quantization instance
* Add missing parameter
* Refine cmake
* Add external api and client example
* Extract variable in example
* Fix the comment

Co-authored-by: zjing14 <zhangjing14@gmail.com>
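To make the new fused epilogue concrete, here is a small, self-contained reference sketch of what a per-layer quantized conv + bias + tanh output step computes for one element. The order of operations is inferred from the operator name Add_Mul_Activation_Mul_Clamp<TanH> and the two scales (scale_acc, scale_z_inv) used in the new example below; the helper name quantize_output and the sample inputs are purely illustrative, and the actual CK implementation may differ in rounding details.

// Hypothetical scalar reference for the per-layer quantized conv + bias + tanh epilogue.
// Assumed semantics of Add_Mul_Activation_Mul_Clamp<TanH>: add bias, scale the int32
// accumulator back to real values, apply tanh, apply the output requantization scale,
// then round and clamp to the int8 range.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

std::int8_t quantize_output(std::int32_t acc, std::int32_t bias, float scale_acc, float scale_z_inv)
{
    float x = static_cast<float>(acc + bias) * scale_acc;    // Add, Mul
    x = std::tanh(x);                                        // Activation
    x = x * scale_z_inv;                                     // Mul (requantization)
    x = std::min(127.0f, std::max(-128.0f, std::round(x)));  // Clamp to int8 range
    return static_cast<std::int8_t>(x);
}

int main()
{
    // Same placeholder scales as in the new per-layer tanh example.
    const float scale_acc   = 0.5f;
    const float scale_z_inv = 0.5f;
    std::printf("%d\n", quantize_output(100, 20, scale_acc, scale_z_inv));
    return 0;
}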
@@ -14,3 +14,8 @@ add_example_executable(example_conv2d_fwd_xdl_bias_relu_perlayer_quantization_in
 add_example_executable(example_conv2d_fwd_dl_bias_relu_perchannel_quantization_int8 conv2d_fwd_dl_bias_relu_perchannel_quantization_int8.cpp)
 add_example_executable(example_conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8 conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8.cpp)
 
+# Conv + bias + tanh perlayer quantization
+add_example_executable(example_conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8 conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8.cpp)
+
+# Conv + bias + tanh perchannel quantization
+add_example_executable(example_conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8 conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8.cpp)
@@ -76,6 +76,10 @@ using DeviceGroupedConvNDFwdInstance =
         5,                   // CThreadTransferSrcDstVectorDim
         4>;                  // CThreadTransferDstScalarPerVector
 
-#include "run_conv2d_fwd_bias_relu_perchannel_quantization_example.inc"
+#include "run_conv2d_fwd_bias_perchannel_quantization_example.inc"
 
-int main() { run_conv2d_fwd_bias_relu_perchannel_quantization_example(); };
+int main()
+{
+    const auto out_element_op = OutElementOp{ActivationOp{}};
+    run_conv2d_fwd_bias_perchannel_quantization_example(out_element_op);
+};
@@ -74,6 +74,11 @@ using DeviceGroupedConvNDFwdInstance =
         5,                   // CThreadTransferSrcDstVectorDim
         4>;                  // CThreadTransferDstScalarPerVector
 
-#include "run_conv2d_fwd_bias_relu_perlayer_quantization_example.inc"
+#include "run_conv2d_fwd_bias_perlayer_quantization_example.inc"
 
-int main() { run_conv2d_fwd_bias_relu_perlayer_quantization_example(); }
+int main()
+{
+    float requant_scale = 0.5f;
+    const auto out_element_op = OutElementOp{requant_scale, ActivationOp{}};
+    run_conv2d_fwd_bias_perlayer_quantization_example(out_element_op);
+}
@@ -0,0 +1,87 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "common.hpp"
+#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp"
+
+using InDataType = int8_t;
+using WeiDataType = int8_t;
+using BiasDataType = int32_t;
+using RequantScaleDataType = float;
+using AccDataType = int32_t;
+using OutDataType = int8_t;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using InElementOp = PassThrough;
+using WeiElementOp = PassThrough;
+using ActivationOp = ck::tensor_operation::element_wise::TanH;
+using OutElementOp =
+    ck::tensor_operation::element_wise::Add_Mul2_Activation_Mul_Clamp<ActivationOp>;
+
+static constexpr auto ConvSpec =
+    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
+
+static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+template <ck::index_t NDimSpatial,
+          typename InLayout,
+          typename WeiLayout,
+          typename BiasLayout,
+          typename RequantScaleLayout,
+          typename OutLayout>
+using DeviceGroupedConvNDFwdInstance =
+    ck::tensor_operation::device::DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK<
+        NDimSpatial,
+        InDataType,
+        WeiDataType,
+        ck::Tuple<BiasDataType, RequantScaleDataType>,
+        OutDataType,
+        AccDataType,
+        InLayout,
+        WeiLayout,
+        ck::Tuple<BiasLayout, RequantScaleLayout>,
+        OutLayout,
+        InElementOp,
+        WeiElementOp,
+        OutElementOp,
+        ConvSpec,            // ConvForwardSpecialization
+        GemmSpec,            // GemmSpecialization
+        256,                 // BlockSize
+        128,                 // MPerBlock
+        128,                 // NPerBlock
+        16,                  // K0PerBlock
+        4,                   // K1
+        4,                   // M1PerThread
+        4,                   // N1PerThread
+        1,                   // KPerThread
+        S<8, 2>,             // M1N1ThreadClusterM1Xs
+        S<8, 2>,             // M1N1ThreadClusterN1Xs
+        S<8, 1, 1, 4>,       // ABlockTransferThreadSliceLengths_K0_M0_M1_K1
+        S<2, 1, 128, 1>,     // ABlockTransferThreadClusterLengths_K0_M0_M1_K1
+        S<1, 2, 0, 3>,       // ABlockTransferThreadClusterArrangeOrder
+        S<1, 2, 0, 3>,       // ABlockTransferSrcAccessOrder
+        S<4, 1, 1, 4>,       // ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1
+        S<1, 2, 0, 3>,       // ABlockTransferSrcVectorTensorContiguousDimOrder
+        S<1, 1, 1, 4>,       // ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1
+        S<8, 1, 1, 4>,       // BBlockTransferThreadSliceLengths_K0_N0_N1_K1
+        S<2, 1, 128, 1>,     // BBlockTransferThreadClusterLengths_K0_N0_N1_K1
+        S<1, 2, 0, 3>,       // BBlockTransferThreadClusterArrangeOrder
+        S<1, 2, 0, 3>,       // BBlockTransferSrcAccessOrder
+        S<4, 1, 1, 4>,       // BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1
+        S<1, 2, 0, 3>,       // BBlockTransferSrcVectorTensorContiguousDimOrder
+        S<1, 1, 1, 4>,       // BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1
+        S<0, 1, 2, 3, 4, 5>, // CThreadTransferSrcDstAccessOrder
+        5,                   // CThreadTransferSrcDstVectorDim
+        4>;                  // CThreadTransferDstScalarPerVector
+
+#include "run_conv2d_fwd_bias_perchannel_quantization_example.inc"
+
+int main()
+{
+    float scale_z_inv = 0.5f;
+    const auto out_element_op = OutElementOp{scale_z_inv, ActivationOp{}};
+    run_conv2d_fwd_bias_perchannel_quantization_example(out_element_op);
+};
@@ -0,0 +1,85 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "common.hpp"
+#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp"
+
+using InDataType = int8_t;
+using WeiDataType = int8_t;
+using BiasDataType = int32_t;
+using AccDataType = int32_t;
+using OutDataType = int8_t;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using InElementOp = PassThrough;
+using WeiElementOp = PassThrough;
+using ActivationOp = ck::tensor_operation::element_wise::TanH;
+using OutElementOp = ck::tensor_operation::element_wise::Add_Mul_Activation_Mul_Clamp<ActivationOp>;
+
+static constexpr auto ConvSpec =
+    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
+
+static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+template <ck::index_t NDimSpatial,
+          typename InLayout,
+          typename WeiLayout,
+          typename BiasLayout,
+          typename OutLayout>
+using DeviceGroupedConvNDFwdInstance =
+    ck::tensor_operation::device::DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK<
+        NDimSpatial,
+        InDataType,
+        WeiDataType,
+        ck::Tuple<BiasDataType>,
+        OutDataType,
+        AccDataType,
+        InLayout,
+        WeiLayout,
+        ck::Tuple<BiasLayout>,
+        OutLayout,
+        InElementOp,
+        WeiElementOp,
+        OutElementOp,
+        ConvSpec,            // ConvForwardSpecialization
+        GemmSpec,            // GemmSpecialization
+        256,                 // BlockSize
+        128,                 // MPerBlock
+        128,                 // NPerBlock
+        16,                  // K0PerBlock
+        4,                   // K1
+        4,                   // M1PerThread
+        4,                   // N1PerThread
+        1,                   // KPerThread
+        S<8, 2>,             // M1N1ThreadClusterM1Xs
+        S<8, 2>,             // M1N1ThreadClusterN1Xs
+        S<8, 1, 1, 4>,       // ABlockTransferThreadSliceLengths_K0_M0_M1_K1
+        S<2, 1, 128, 1>,     // ABlockTransferThreadClusterLengths_K0_M0_M1_K1
+        S<1, 2, 0, 3>,       // ABlockTransferThreadClusterArrangeOrder
+        S<1, 2, 0, 3>,       // ABlockTransferSrcAccessOrder
+        S<4, 1, 1, 4>,       // ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1
+        S<1, 2, 0, 3>,       // ABlockTransferSrcVectorTensorContiguousDimOrder
+        S<1, 1, 1, 4>,       // ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1
+        S<8, 1, 1, 4>,       // BBlockTransferThreadSliceLengths_K0_N0_N1_K1
+        S<2, 1, 128, 1>,     // BBlockTransferThreadClusterLengths_K0_N0_N1_K1
+        S<1, 2, 0, 3>,       // BBlockTransferThreadClusterArrangeOrder
+        S<1, 2, 0, 3>,       // BBlockTransferSrcAccessOrder
+        S<4, 1, 1, 4>,       // BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1
+        S<1, 2, 0, 3>,       // BBlockTransferSrcVectorTensorContiguousDimOrder
+        S<1, 1, 1, 4>,       // BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1
+        S<0, 1, 2, 3, 4, 5>, // CThreadTransferSrcDstAccessOrder
+        5,                   // CThreadTransferSrcDstVectorDim
+        4>;                  // CThreadTransferDstScalarPerVector
+
+#include "run_conv2d_fwd_bias_perlayer_quantization_example.inc"
+
+int main()
+{
+    float scale_acc = 0.5f;
+    float scale_z_inv = 0.5f;
+    const auto out_element_op = OutElementOp{scale_z_inv, scale_acc, ActivationOp{}};
+    run_conv2d_fwd_bias_perlayer_quantization_example(out_element_op);
+}
@@ -76,4 +76,8 @@ using DeviceGroupedConvNDFwdInstance =
 
 #include "run_conv2d_fwd_perchannel_quantization_example.inc"
 
-int main() { run_conv2d_fwd_perchannel_quantization_example(); }
+int main()
+{
+    const auto out_element_op = OutElementOp{ActivationOp{}};
+    run_conv2d_fwd_perchannel_quantization_example(out_element_op);
+}
@@ -71,4 +71,9 @@ using DeviceGroupedConvNDFwdInstance =
 
 #include "run_conv2d_fwd_perlayer_quantization_example.inc"
 
-int main() { run_conv2d_fwd_perlayer_quantization_example(); }
+int main()
+{
+    float requant_scale = 0.5f;
+    const auto out_element_op = OutElementOp{requant_scale, ActivationOp{}};
+    run_conv2d_fwd_perlayer_quantization_example(out_element_op);
+}
@@ -80,6 +80,10 @@ using DeviceGroupedConvNDFwdInstance =
         S<1, 64, 1, 4>,
         8>;
 
-#include "run_conv2d_fwd_bias_relu_perchannel_quantization_example.inc"
+#include "run_conv2d_fwd_bias_perchannel_quantization_example.inc"
 
-int main() { run_conv2d_fwd_bias_relu_perchannel_quantization_example(); };
+int main()
+{
+    const auto out_element_op = OutElementOp{ActivationOp{}};
+    run_conv2d_fwd_bias_perchannel_quantization_example(out_element_op);
+};
@@ -78,6 +78,11 @@ using DeviceGroupedConvNDFwdInstance =
         S<1, 64, 1, 4>,
         8>;
 
-#include "run_conv2d_fwd_bias_relu_perlayer_quantization_example.inc"
+#include "run_conv2d_fwd_bias_perlayer_quantization_example.inc"
 
-int main() { run_conv2d_fwd_bias_relu_perlayer_quantization_example(); }
+int main()
+{
+    float requant_scale = 0.5f;
+    const auto out_element_op = OutElementOp{requant_scale, ActivationOp{}};
+    run_conv2d_fwd_bias_perlayer_quantization_example(out_element_op);
+}
@@ -80,4 +80,8 @@ using DeviceGroupedConvNDFwdInstance =
 
 #include "run_conv2d_fwd_perchannel_quantization_example.inc"
 
-int main() { run_conv2d_fwd_perchannel_quantization_example(); }
+int main()
+{
+    const auto out_element_op = OutElementOp{ActivationOp{}};
+    run_conv2d_fwd_perchannel_quantization_example(out_element_op);
+}
@@ -75,4 +75,9 @@ using DeviceGroupedConvNDFwdInstance =
 
 #include "run_conv2d_fwd_perlayer_quantization_example.inc"
 
-int main() { run_conv2d_fwd_perlayer_quantization_example(); }
+int main()
+{
+    float requant_scale = 0.5f;
+    const auto out_element_op = OutElementOp{requant_scale, ActivationOp{}};
+    run_conv2d_fwd_perlayer_quantization_example(out_element_op);
+}
@@ -167,7 +167,7 @@ bool run_grouped_conv_fwd(bool do_verification,
     return (pass ? 0 : 1);
 }
 
-int run_conv2d_fwd_bias_relu_perchannel_quantization_example()
+int run_conv2d_fwd_bias_perchannel_quantization_example(const OutElementOp& out_element_op)
 {
     bool do_verification = true;
     bool time_kernel = true;
@@ -189,7 +189,6 @@ int run_conv2d_fwd_bias_relu_perchannel_quantization_example()
 
     const auto in_element_op = InElementOp{};
     const auto wei_element_op = WeiElementOp{};
-    const auto out_element_op = OutElementOp{ActivationOp{}};
 
     using InLayout = ck::tensor_layout::convolution::GNHWC;
     using WeiLayout = ck::tensor_layout::convolution::GKYXC;
@@ -155,7 +155,7 @@ bool run_grouped_conv_fwd(bool do_verification,
     return (pass ? 0 : 1);
 }
 
-int run_conv2d_fwd_bias_relu_perlayer_quantization_example()
+int run_conv2d_fwd_bias_perlayer_quantization_example(const OutElementOp& out_element_op)
 {
     bool do_verification = true;
     bool time_kernel = true;
@@ -177,7 +177,6 @@ int run_conv2d_fwd_bias_relu_perlayer_quantization_example()
 
     const auto in_element_op = InElementOp{};
     const auto wei_element_op = WeiElementOp{};
-    const auto out_element_op = OutElementOp{0.5f, ActivationOp{}};
 
     using InLayout = ck::tensor_layout::convolution::GNHWC;
    using WeiLayout = ck::tensor_layout::convolution::GKYXC;
@@ -157,7 +157,7 @@ bool run_grouped_conv_fwd(bool do_verification,
     return (pass ? 0 : 1);
 }
 
-int run_conv2d_fwd_perchannel_quantization_example()
+int run_conv2d_fwd_perchannel_quantization_example(const OutElementOp& out_element_op)
 {
     bool do_verification = true;
     bool time_kernel = true;
@@ -179,7 +179,6 @@ int run_conv2d_fwd_perchannel_quantization_example()
 
     const auto in_element_op = InElementOp{};
     const auto wei_element_op = WeiElementOp{};
-    const auto out_element_op = OutElementOp{ActivationOp{}};
 
     using InLayout = ck::tensor_layout::convolution::GNHWC;
     using WeiLayout = ck::tensor_layout::convolution::GKYXC;
@@ -139,7 +139,7 @@ bool run_grouped_conv_fwd(bool do_verification,
     return (pass ? 0 : 1);
 }
 
-int run_conv2d_fwd_perlayer_quantization_example()
+int run_conv2d_fwd_perlayer_quantization_example(const OutElementOp& out_element_op)
 {
     bool do_verification = true;
     bool time_kernel = false;
@@ -161,7 +161,6 @@ int run_conv2d_fwd_perlayer_quantization_example()
 
     const auto in_element_op = InElementOp{};
     const auto wei_element_op = WeiElementOp{};
-    const auto out_element_op = OutElementOp{0.5f, ActivationOp{}};
 
     using InLayout = ck::tensor_layout::convolution::GNHWC;
     using WeiLayout = ck::tensor_layout::convolution::GKYXC;