mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-14 02:02:46 +00:00
Conv + quantization + tanh (#645)
* Rename file. Prepare to support another activation
* Add comment for quantization
* Extract out_elementop
* Add tanh example
* Add conv + bias + tanh quantization instance
* Add missing parameter
* Refine cmake
* Add external api and client example
* Extract variable in example
* Fix the comment
---------
Co-authored-by: zjing14 <zhangjing14@gmail.com>
[ROCm/composable_kernel commit: 389e84a83b]
This commit is contained in:
@@ -1,6 +1,12 @@
|
||||
add_executable(client_conv2d_fwd_bias_tanh_perchannel_quantization conv2d_fwd_bias_tanh_perchannel_quantization.cpp)
|
||||
target_link_libraries(client_conv2d_fwd_bias_tanh_perchannel_quantization PRIVATE composable_kernel::device_operations)
|
||||
|
||||
add_executable(client_conv2d_fwd_bias_relu_perchannel_quantization conv2d_fwd_bias_relu_perchannel_quantization.cpp)
|
||||
target_link_libraries(client_conv2d_fwd_bias_relu_perchannel_quantization PRIVATE composable_kernel::device_operations)
|
||||
|
||||
add_executable(client_conv2d_fwd_bias_tanh_perlayer_quantization conv2d_fwd_bias_tanh_perlayer_quantization.cpp)
|
||||
target_link_libraries(client_conv2d_fwd_bias_tanh_perlayer_quantization PRIVATE composable_kernel::device_operations)
|
||||
|
||||
add_executable(client_conv2d_fwd_bias_relu_perlayer_quantization conv2d_fwd_bias_relu_perlayer_quantization.cpp)
|
||||
target_link_libraries(client_conv2d_fwd_bias_relu_perlayer_quantization PRIVATE composable_kernel::device_operations)
|
||||
|
||||
|
||||
@@ -26,15 +26,16 @@ using OutElementOp = ck::tensor_operation::element_wise::Add_Activation_Mul_Clam
|
||||
|
||||
static constexpr ck::index_t NumDimSpatial = 2;
|
||||
static constexpr ck::index_t G = 1;
|
||||
static constexpr ck::index_t N = 4; // batch size
|
||||
static constexpr ck::index_t K = 64; // output channel
|
||||
static constexpr ck::index_t C = 192; // input channel
|
||||
static constexpr ck::index_t Y = 3; // filter H
|
||||
static constexpr ck::index_t X = 3; // filter W
|
||||
static constexpr ck::index_t Hi = 71; // input H
|
||||
static constexpr ck::index_t Wi = 71; // input W
|
||||
static constexpr ck::index_t Ho = 36; // output H
|
||||
static constexpr ck::index_t Wo = 36; // output W
|
||||
static constexpr ck::index_t N = 4; // batch size
|
||||
static constexpr ck::index_t K = 64; // output channel
|
||||
static constexpr ck::index_t C = 192; // input channel
|
||||
static constexpr ck::index_t Y = 3; // filter H
|
||||
static constexpr ck::index_t X = 3; // filter W
|
||||
static constexpr ck::index_t Hi = 71; // input H
|
||||
static constexpr ck::index_t Wi = 71; // input W
|
||||
static constexpr ck::index_t Ho = 36; // output H
|
||||
static constexpr ck::index_t Wo = 36; // output W
|
||||
static constexpr float requant_scale = 0.5f; // requantize qAcc to qz
|
||||
|
||||
struct SimpleDeviceMem
|
||||
{
|
||||
@@ -102,26 +103,27 @@ int main(int argc, char* argv[])
|
||||
|
||||
for(int i = 0; i < op_ptrs.size(); ++i)
|
||||
{
|
||||
auto& op_ptr = op_ptrs[i];
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
|
||||
wei.GetDeviceBuffer(),
|
||||
{bias.GetDeviceBuffer()},
|
||||
out.GetDeviceBuffer(),
|
||||
in_lengths,
|
||||
in_strides,
|
||||
weight_lengths,
|
||||
weight_strides,
|
||||
{bias_lengths},
|
||||
{bias_strides},
|
||||
out_lengths,
|
||||
out_strides,
|
||||
conv_strides,
|
||||
conv_dilations,
|
||||
in_left_pad,
|
||||
in_right_pad,
|
||||
PassThrough{},
|
||||
PassThrough{},
|
||||
OutElementOp{0.5f, ActivationOp{}});
|
||||
auto& op_ptr = op_ptrs[i];
|
||||
auto argument_ptr =
|
||||
op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
|
||||
wei.GetDeviceBuffer(),
|
||||
{bias.GetDeviceBuffer()},
|
||||
out.GetDeviceBuffer(),
|
||||
in_lengths,
|
||||
in_strides,
|
||||
weight_lengths,
|
||||
weight_strides,
|
||||
{bias_lengths},
|
||||
{bias_strides},
|
||||
out_lengths,
|
||||
out_strides,
|
||||
conv_strides,
|
||||
conv_dilations,
|
||||
in_left_pad,
|
||||
in_right_pad,
|
||||
PassThrough{},
|
||||
PassThrough{},
|
||||
OutElementOp{requant_scale, ActivationOp{}});
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
std::string op_name = op_ptr->GetTypeString();
|
||||
@@ -165,25 +167,26 @@ int main(int argc, char* argv[])
|
||||
auto& op_ptr = op_ptrs[best_op_id];
|
||||
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
|
||||
<< std::endl;
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
|
||||
wei.GetDeviceBuffer(),
|
||||
{bias.GetDeviceBuffer()},
|
||||
out.GetDeviceBuffer(),
|
||||
in_lengths,
|
||||
in_strides,
|
||||
weight_lengths,
|
||||
weight_strides,
|
||||
{bias_lengths},
|
||||
{bias_strides},
|
||||
out_lengths,
|
||||
out_strides,
|
||||
conv_strides,
|
||||
conv_dilations,
|
||||
in_left_pad,
|
||||
in_right_pad,
|
||||
PassThrough{},
|
||||
PassThrough{},
|
||||
OutElementOp{0.5f, ActivationOp{}});
|
||||
auto argument_ptr =
|
||||
op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
|
||||
wei.GetDeviceBuffer(),
|
||||
{bias.GetDeviceBuffer()},
|
||||
out.GetDeviceBuffer(),
|
||||
in_lengths,
|
||||
in_strides,
|
||||
weight_lengths,
|
||||
weight_strides,
|
||||
{bias_lengths},
|
||||
{bias_strides},
|
||||
out_lengths,
|
||||
out_strides,
|
||||
conv_strides,
|
||||
conv_dilations,
|
||||
in_left_pad,
|
||||
in_right_pad,
|
||||
PassThrough{},
|
||||
PassThrough{},
|
||||
OutElementOp{requant_scale, ActivationOp{}});
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
|
||||
@@ -0,0 +1,209 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iomanip>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perchannel_quantization.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
using InDataType = int8_t;
|
||||
using WeiDataType = int8_t;
|
||||
using BiasDataType = int32_t;
|
||||
using RequantScaleDataType = float;
|
||||
using OutDataType = int8_t;
|
||||
|
||||
using InLayout = ck::tensor_layout::convolution::GNHWC;
|
||||
using WeiLayout = ck::tensor_layout::convolution::GKYXC;
|
||||
using BiasLayout = ck::tensor_layout::convolution::G_K;
|
||||
using RequantScaleLayout = ck::tensor_layout::convolution::G_K;
|
||||
using OutLayout = ck::tensor_layout::convolution::GNHWK;
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
using ActivationOp = ck::tensor_operation::element_wise::TanH;
|
||||
using OutElementOp =
|
||||
ck::tensor_operation::element_wise::Add_Mul2_Activation_Mul_Clamp<ActivationOp>;
|
||||
|
||||
static constexpr ck::index_t NumDimSpatial = 2;
|
||||
static constexpr ck::index_t G = 1;
|
||||
static constexpr ck::index_t N = 4; // batch size
|
||||
static constexpr ck::index_t K = 64; // output channel
|
||||
static constexpr ck::index_t C = 192; // input channel
|
||||
static constexpr ck::index_t Y = 3; // filter H
|
||||
static constexpr ck::index_t X = 3; // filter W
|
||||
static constexpr ck::index_t Hi = 71; // input H
|
||||
static constexpr ck::index_t Wi = 71; // input W
|
||||
static constexpr ck::index_t Ho = 36; // output H
|
||||
static constexpr ck::index_t Wo = 36; // output W
|
||||
static constexpr float sz_inv = 0.5f; // inverse of scale_z
|
||||
|
||||
struct SimpleDeviceMem
|
||||
{
|
||||
SimpleDeviceMem() = delete;
|
||||
|
||||
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
|
||||
{
|
||||
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
|
||||
}
|
||||
|
||||
void* GetDeviceBuffer() { return p_mem_; }
|
||||
|
||||
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
|
||||
|
||||
void* p_mem_;
|
||||
};
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
std::array<ck::index_t, 5> in_lengths{G, N, C, Hi, Wi};
|
||||
std::array<ck::index_t, 5> in_strides{N * Hi * Wi * C, Hi * Wi * C, 1, Wi * C, C};
|
||||
std::array<ck::index_t, 5> weight_lengths{G, K, C, Y, X};
|
||||
std::array<ck::index_t, 5> weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C};
|
||||
std::array<ck::index_t, 5> bias_lengths{G, N, K, Ho, Wo};
|
||||
std::array<ck::index_t, 5> bias_strides{K, 0, 1, 0, 0};
|
||||
std::array<ck::index_t, 5> requant_scale_lengths{G, N, K, Ho, Wo};
|
||||
std::array<ck::index_t, 5> requant_scale_strides{K, 0, 1, 0, 0};
|
||||
std::array<ck::index_t, 5> out_lengths{G, N, K, Ho, Wo};
|
||||
std::array<ck::index_t, 5> out_strides{N * Ho * Wo * K, Ho * Wo * K, 1, Wo * K, K};
|
||||
std::array<ck::index_t, 2> in_left_pad{1, 1};
|
||||
std::array<ck::index_t, 2> in_right_pad{1, 1};
|
||||
std::array<ck::index_t, 2> conv_strides{2, 2};
|
||||
std::array<ck::index_t, 2> conv_dilations{1, 1};
|
||||
|
||||
SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * C);
|
||||
SimpleDeviceMem wei(sizeof(WeiDataType) * K * Y * X * C);
|
||||
SimpleDeviceMem bias(sizeof(BiasDataType) * K * Y * X * C);
|
||||
SimpleDeviceMem requant_scale(sizeof(RequantScaleDataType) * K * Y * X * C);
|
||||
SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * K);
|
||||
|
||||
using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<
|
||||
NumDimSpatial,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
ck::Tuple<BiasLayout, RequantScaleLayout>,
|
||||
OutLayout,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
ck::Tuple<BiasDataType, RequantScaleDataType>,
|
||||
OutDataType,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
OutElementOp>;
|
||||
// get device op instances
|
||||
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
|
||||
DeviceOp>::GetInstances();
|
||||
|
||||
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
|
||||
|
||||
std::string best_op_name;
|
||||
int best_op_id = -1;
|
||||
float best_avg_time = std::numeric_limits<float>::max();
|
||||
float best_gb_per_sec = 0;
|
||||
float best_tflops = 0;
|
||||
|
||||
// profile device operation instances
|
||||
std::cout << "Run all instances and do timing" << std::endl;
|
||||
|
||||
for(int i = 0; i < op_ptrs.size(); ++i)
|
||||
{
|
||||
auto& op_ptr = op_ptrs[i];
|
||||
auto argument_ptr =
|
||||
op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
|
||||
wei.GetDeviceBuffer(),
|
||||
{bias.GetDeviceBuffer(), requant_scale.GetDeviceBuffer()},
|
||||
out.GetDeviceBuffer(),
|
||||
in_lengths,
|
||||
in_strides,
|
||||
weight_lengths,
|
||||
weight_strides,
|
||||
{bias_lengths, requant_scale_lengths},
|
||||
{bias_strides, requant_scale_strides},
|
||||
out_lengths,
|
||||
out_strides,
|
||||
conv_strides,
|
||||
conv_dilations,
|
||||
in_left_pad,
|
||||
in_right_pad,
|
||||
PassThrough{},
|
||||
PassThrough{},
|
||||
OutElementOp{sz_inv, ActivationOp{}});
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
std::string op_name = op_ptr->GetTypeString();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
|
||||
|
||||
std::size_t flop = G * 2 * N * K * C * Ho * Wo * Y * X;
|
||||
std::size_t num_bytes =
|
||||
G * sizeof(InDataType) * N * Hi * Wi * C + G * sizeof(WeiDataType) * K * Y * X * C +
|
||||
G * sizeof(BiasDataType) * K + G * sizeof(RequantScaleDataType) * K +
|
||||
G * sizeof(OutDataType) * N * Ho * Wo * K;
|
||||
|
||||
float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
|
||||
float gb_per_sec = num_bytes / 1.E6 / avg_time;
|
||||
|
||||
std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, "
|
||||
<< gb_per_sec << " GB/s, " << op_name << std::endl;
|
||||
|
||||
if(tflops > best_tflops)
|
||||
{
|
||||
best_op_id = i;
|
||||
best_op_name = op_name;
|
||||
best_avg_time = avg_time;
|
||||
best_gb_per_sec = gb_per_sec;
|
||||
best_tflops = tflops;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cout << op_name << " does not support this problem" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
// run the best intance
|
||||
if(best_op_id != -1)
|
||||
{
|
||||
std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
|
||||
<< " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
|
||||
|
||||
auto& op_ptr = op_ptrs[best_op_id];
|
||||
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
|
||||
<< std::endl;
|
||||
auto argument_ptr =
|
||||
op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
|
||||
wei.GetDeviceBuffer(),
|
||||
{bias.GetDeviceBuffer(), requant_scale.GetDeviceBuffer()},
|
||||
out.GetDeviceBuffer(),
|
||||
in_lengths,
|
||||
in_strides,
|
||||
weight_lengths,
|
||||
weight_strides,
|
||||
{bias_lengths, requant_scale_lengths},
|
||||
{bias_strides, requant_scale_strides},
|
||||
out_lengths,
|
||||
out_strides,
|
||||
conv_strides,
|
||||
conv_dilations,
|
||||
in_left_pad,
|
||||
in_right_pad,
|
||||
PassThrough{},
|
||||
PassThrough{},
|
||||
OutElementOp{sz_inv, ActivationOp{}});
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
|
||||
}
|
||||
|
||||
std::cout << "Done" << std::endl;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -0,0 +1,201 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iomanip>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perlayer_quantization.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
using InDataType = int8_t;
|
||||
using WeiDataType = int8_t;
|
||||
using BiasDataType = int32_t;
|
||||
using OutDataType = int8_t;
|
||||
|
||||
using InLayout = ck::tensor_layout::convolution::GNHWC;
|
||||
using WeiLayout = ck::tensor_layout::convolution::GKYXC;
|
||||
using BiasLayout = ck::tensor_layout::convolution::G_K;
|
||||
using OutLayout = ck::tensor_layout::convolution::GNHWK;
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
using ActivationOp = ck::tensor_operation::element_wise::TanH;
|
||||
using OutElementOp = ck::tensor_operation::element_wise::Add_Mul_Activation_Mul_Clamp<ActivationOp>;
|
||||
|
||||
static constexpr ck::index_t NumDimSpatial = 2;
|
||||
static constexpr ck::index_t G = 1;
|
||||
static constexpr ck::index_t N = 4; // batch size
|
||||
static constexpr ck::index_t K = 64; // output channel
|
||||
static constexpr ck::index_t C = 192; // input channel
|
||||
static constexpr ck::index_t Y = 3; // filter H
|
||||
static constexpr ck::index_t X = 3; // filter W
|
||||
static constexpr ck::index_t Hi = 71; // input H
|
||||
static constexpr ck::index_t Wi = 71; // input W
|
||||
static constexpr ck::index_t Ho = 36; // output H
|
||||
static constexpr ck::index_t Wo = 36; // output W
|
||||
static constexpr float sacc = 0.5f; // scale of acc
|
||||
static constexpr float sz_inv = 0.5f; // inverse of scale_z
|
||||
|
||||
struct SimpleDeviceMem
|
||||
{
|
||||
SimpleDeviceMem() = delete;
|
||||
|
||||
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
|
||||
{
|
||||
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
|
||||
}
|
||||
|
||||
void* GetDeviceBuffer() { return p_mem_; }
|
||||
|
||||
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
|
||||
|
||||
void* p_mem_;
|
||||
};
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
std::array<ck::index_t, 5> in_lengths{G, N, C, Hi, Wi};
|
||||
std::array<ck::index_t, 5> in_strides{N * Hi * Wi * C, Hi * Wi * C, 1, Wi * C, C};
|
||||
std::array<ck::index_t, 5> weight_lengths{G, K, C, Y, X};
|
||||
std::array<ck::index_t, 5> weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C};
|
||||
std::array<ck::index_t, 5> bias_lengths{G, N, K, Ho, Wo};
|
||||
std::array<ck::index_t, 5> bias_strides{K, 0, 1, 0, 0};
|
||||
std::array<ck::index_t, 5> out_lengths{G, N, K, Ho, Wo};
|
||||
std::array<ck::index_t, 5> out_strides{N * Ho * Wo * K, Ho * Wo * K, 1, Wo * K, K};
|
||||
std::array<ck::index_t, 2> in_left_pad{1, 1};
|
||||
std::array<ck::index_t, 2> in_right_pad{1, 1};
|
||||
std::array<ck::index_t, 2> conv_strides{2, 2};
|
||||
std::array<ck::index_t, 2> conv_dilations{1, 1};
|
||||
|
||||
SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * C);
|
||||
SimpleDeviceMem wei(sizeof(WeiDataType) * K * Y * X * C);
|
||||
SimpleDeviceMem bias(sizeof(BiasDataType) * K * Y * X * C);
|
||||
SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * K);
|
||||
|
||||
using DeviceOp =
|
||||
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NumDimSpatial,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
ck::Tuple<BiasLayout>,
|
||||
OutLayout,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
ck::Tuple<BiasDataType>,
|
||||
OutDataType,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
OutElementOp>;
|
||||
// get device op instances
|
||||
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
|
||||
DeviceOp>::GetInstances();
|
||||
|
||||
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
|
||||
|
||||
std::string best_op_name;
|
||||
int best_op_id = -1;
|
||||
float best_avg_time = std::numeric_limits<float>::max();
|
||||
float best_gb_per_sec = 0;
|
||||
float best_tflops = 0;
|
||||
|
||||
// profile device operation instances
|
||||
std::cout << "Run all instances and do timing" << std::endl;
|
||||
|
||||
for(int i = 0; i < op_ptrs.size(); ++i)
|
||||
{
|
||||
auto& op_ptr = op_ptrs[i];
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
|
||||
wei.GetDeviceBuffer(),
|
||||
{bias.GetDeviceBuffer()},
|
||||
out.GetDeviceBuffer(),
|
||||
in_lengths,
|
||||
in_strides,
|
||||
weight_lengths,
|
||||
weight_strides,
|
||||
{bias_lengths},
|
||||
{bias_strides},
|
||||
out_lengths,
|
||||
out_strides,
|
||||
conv_strides,
|
||||
conv_dilations,
|
||||
in_left_pad,
|
||||
in_right_pad,
|
||||
PassThrough{},
|
||||
PassThrough{},
|
||||
OutElementOp{sacc, sz_inv, ActivationOp{}});
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
std::string op_name = op_ptr->GetTypeString();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
|
||||
|
||||
std::size_t flop = G * 2 * N * K * C * Ho * Wo * Y * X;
|
||||
std::size_t num_bytes =
|
||||
G * sizeof(InDataType) * N * Hi * Wi * C + G * sizeof(WeiDataType) * K * Y * X * C +
|
||||
G * sizeof(BiasDataType) * K + G * sizeof(OutDataType) * N * Ho * Wo * K;
|
||||
|
||||
float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
|
||||
float gb_per_sec = num_bytes / 1.E6 / avg_time;
|
||||
|
||||
std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, "
|
||||
<< gb_per_sec << " GB/s, " << op_name << std::endl;
|
||||
|
||||
if(tflops > best_tflops)
|
||||
{
|
||||
best_op_id = i;
|
||||
best_op_name = op_name;
|
||||
best_avg_time = avg_time;
|
||||
best_gb_per_sec = gb_per_sec;
|
||||
best_tflops = tflops;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cout << op_name << " does not support this problem" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
// run the best intance
|
||||
if(best_op_id != -1)
|
||||
{
|
||||
std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
|
||||
<< " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
|
||||
|
||||
auto& op_ptr = op_ptrs[best_op_id];
|
||||
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
|
||||
<< std::endl;
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
|
||||
wei.GetDeviceBuffer(),
|
||||
{bias.GetDeviceBuffer()},
|
||||
out.GetDeviceBuffer(),
|
||||
in_lengths,
|
||||
in_strides,
|
||||
weight_lengths,
|
||||
weight_strides,
|
||||
{bias_lengths},
|
||||
{bias_strides},
|
||||
out_lengths,
|
||||
out_strides,
|
||||
conv_strides,
|
||||
conv_dilations,
|
||||
in_left_pad,
|
||||
in_right_pad,
|
||||
PassThrough{},
|
||||
PassThrough{},
|
||||
OutElementOp{sacc, sz_inv, ActivationOp{}});
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
|
||||
}
|
||||
|
||||
std::cout << "Done" << std::endl;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -24,15 +24,16 @@ using OutElementOp = ck::tensor_operation::element_wise::Activation_Mul_Clamp<Ac
|
||||
|
||||
static constexpr ck::index_t NumDimSpatial = 2;
|
||||
static constexpr ck::index_t G = 1;
|
||||
static constexpr ck::index_t N = 4; // batch size
|
||||
static constexpr ck::index_t K = 64; // output channel
|
||||
static constexpr ck::index_t C = 192; // input channel
|
||||
static constexpr ck::index_t Y = 3; // filter H
|
||||
static constexpr ck::index_t X = 3; // filter W
|
||||
static constexpr ck::index_t Hi = 71; // input H
|
||||
static constexpr ck::index_t Wi = 71; // input W
|
||||
static constexpr ck::index_t Ho = 36; // output H
|
||||
static constexpr ck::index_t Wo = 36; // output W
|
||||
static constexpr ck::index_t N = 4; // batch size
|
||||
static constexpr ck::index_t K = 64; // output channel
|
||||
static constexpr ck::index_t C = 192; // input channel
|
||||
static constexpr ck::index_t Y = 3; // filter H
|
||||
static constexpr ck::index_t X = 3; // filter W
|
||||
static constexpr ck::index_t Hi = 71; // input H
|
||||
static constexpr ck::index_t Wi = 71; // input W
|
||||
static constexpr ck::index_t Ho = 36; // output H
|
||||
static constexpr ck::index_t Wo = 36; // output W
|
||||
static constexpr float requant_scale = 0.5f; // requantize qAcc to qY
|
||||
|
||||
struct SimpleDeviceMem
|
||||
{
|
||||
@@ -96,26 +97,27 @@ int main(int argc, char* argv[])
|
||||
|
||||
for(int i = 0; i < op_ptrs.size(); ++i)
|
||||
{
|
||||
auto& op_ptr = op_ptrs[i];
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
|
||||
wei.GetDeviceBuffer(),
|
||||
{},
|
||||
out.GetDeviceBuffer(),
|
||||
in_lengths,
|
||||
in_strides,
|
||||
weight_lengths,
|
||||
weight_strides,
|
||||
{},
|
||||
{},
|
||||
out_lengths,
|
||||
out_strides,
|
||||
conv_strides,
|
||||
conv_dilations,
|
||||
in_left_pad,
|
||||
in_right_pad,
|
||||
PassThrough{},
|
||||
PassThrough{},
|
||||
OutElementOp{0.5f, ActivationOp{}});
|
||||
auto& op_ptr = op_ptrs[i];
|
||||
auto argument_ptr =
|
||||
op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
|
||||
wei.GetDeviceBuffer(),
|
||||
{},
|
||||
out.GetDeviceBuffer(),
|
||||
in_lengths,
|
||||
in_strides,
|
||||
weight_lengths,
|
||||
weight_strides,
|
||||
{},
|
||||
{},
|
||||
out_lengths,
|
||||
out_strides,
|
||||
conv_strides,
|
||||
conv_dilations,
|
||||
in_left_pad,
|
||||
in_right_pad,
|
||||
PassThrough{},
|
||||
PassThrough{},
|
||||
OutElementOp{requant_scale, ActivationOp{}});
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
std::string op_name = op_ptr->GetTypeString();
|
||||
@@ -158,25 +160,26 @@ int main(int argc, char* argv[])
|
||||
auto& op_ptr = op_ptrs[best_op_id];
|
||||
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
|
||||
<< std::endl;
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
|
||||
wei.GetDeviceBuffer(),
|
||||
{},
|
||||
out.GetDeviceBuffer(),
|
||||
in_lengths,
|
||||
in_strides,
|
||||
weight_lengths,
|
||||
weight_strides,
|
||||
{},
|
||||
{},
|
||||
out_lengths,
|
||||
out_strides,
|
||||
conv_strides,
|
||||
conv_dilations,
|
||||
in_left_pad,
|
||||
in_right_pad,
|
||||
PassThrough{},
|
||||
PassThrough{},
|
||||
OutElementOp{0.5f, ActivationOp{}});
|
||||
auto argument_ptr =
|
||||
op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
|
||||
wei.GetDeviceBuffer(),
|
||||
{},
|
||||
out.GetDeviceBuffer(),
|
||||
in_lengths,
|
||||
in_strides,
|
||||
weight_lengths,
|
||||
weight_strides,
|
||||
{},
|
||||
{},
|
||||
out_lengths,
|
||||
out_strides,
|
||||
conv_strides,
|
||||
conv_dilations,
|
||||
in_left_pad,
|
||||
in_right_pad,
|
||||
PassThrough{},
|
||||
PassThrough{},
|
||||
OutElementOp{requant_scale, ActivationOp{}});
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
|
||||
@@ -14,3 +14,8 @@ add_example_executable(example_conv2d_fwd_xdl_bias_relu_perlayer_quantization_in
|
||||
add_example_executable(example_conv2d_fwd_dl_bias_relu_perchannel_quantization_int8 conv2d_fwd_dl_bias_relu_perchannel_quantization_int8.cpp)
|
||||
add_example_executable(example_conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8 conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8.cpp)
|
||||
|
||||
# Conv + bias + tanh perlayer quantization
|
||||
add_example_executable(example_conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8 conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8.cpp)
|
||||
|
||||
# Conv + bias + tanh perchannel quantization
|
||||
add_example_executable(example_conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8 conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8.cpp)
|
||||
|
||||
@@ -76,6 +76,10 @@ using DeviceGroupedConvNDFwdInstance =
|
||||
5, // CThreadTransferSrcDstVectorDim
|
||||
4>; // CThreadTransferDstScalarPerVector
|
||||
|
||||
#include "run_conv2d_fwd_bias_relu_perchannel_quantization_example.inc"
|
||||
#include "run_conv2d_fwd_bias_perchannel_quantization_example.inc"
|
||||
|
||||
int main() { run_conv2d_fwd_bias_relu_perchannel_quantization_example(); };
|
||||
int main()
|
||||
{
|
||||
const auto out_element_op = OutElementOp{ActivationOp{}};
|
||||
run_conv2d_fwd_bias_perchannel_quantization_example(out_element_op);
|
||||
};
|
||||
|
||||
@@ -74,6 +74,11 @@ using DeviceGroupedConvNDFwdInstance =
|
||||
5, // CThreadTransferSrcDstVectorDim
|
||||
4>; // CThreadTransferDstScalarPerVector
|
||||
|
||||
#include "run_conv2d_fwd_bias_relu_perlayer_quantization_example.inc"
|
||||
#include "run_conv2d_fwd_bias_perlayer_quantization_example.inc"
|
||||
|
||||
int main() { run_conv2d_fwd_bias_relu_perlayer_quantization_example(); }
|
||||
int main()
|
||||
{
|
||||
float requant_scale = 0.5f;
|
||||
const auto out_element_op = OutElementOp{requant_scale, ActivationOp{}};
|
||||
run_conv2d_fwd_bias_perlayer_quantization_example(out_element_op);
|
||||
}
|
||||
|
||||
@@ -0,0 +1,87 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "common.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp"
|
||||
|
||||
using InDataType = int8_t;
|
||||
using WeiDataType = int8_t;
|
||||
using BiasDataType = int32_t;
|
||||
using RequantScaleDataType = float;
|
||||
using AccDataType = int32_t;
|
||||
using OutDataType = int8_t;
|
||||
|
||||
template <ck::index_t... Is>
|
||||
using S = ck::Sequence<Is...>;
|
||||
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
using InElementOp = PassThrough;
|
||||
using WeiElementOp = PassThrough;
|
||||
using ActivationOp = ck::tensor_operation::element_wise::TanH;
|
||||
using OutElementOp =
|
||||
ck::tensor_operation::element_wise::Add_Mul2_Activation_Mul_Clamp<ActivationOp>;
|
||||
|
||||
static constexpr auto ConvSpec =
|
||||
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
|
||||
|
||||
static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
|
||||
|
||||
template <ck::index_t NDimSpatial,
|
||||
typename InLayout,
|
||||
typename WeiLayout,
|
||||
typename BiasLayout,
|
||||
typename RequantScaleLayout,
|
||||
typename OutLayout>
|
||||
using DeviceGroupedConvNDFwdInstance =
|
||||
ck::tensor_operation::device::DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK<
|
||||
NDimSpatial,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
ck::Tuple<BiasDataType, RequantScaleDataType>,
|
||||
OutDataType,
|
||||
AccDataType,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
ck::Tuple<BiasLayout, RequantScaleLayout>,
|
||||
OutLayout,
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
OutElementOp,
|
||||
ConvSpec, // ConvForwardSpecialization
|
||||
GemmSpec, // GemmSpecialization
|
||||
256, // BlockSize
|
||||
128, // MPerBlock
|
||||
128, // NPerBlock
|
||||
16, // K0PerBlock
|
||||
4, // K1
|
||||
4, // M1PerThread
|
||||
4, // N1PerThread
|
||||
1, // KPerThread
|
||||
S<8, 2>, // M1N1ThreadClusterM1Xs
|
||||
S<8, 2>, // M1N1ThreadClusterN1Xs
|
||||
S<8, 1, 1, 4>, // ABlockTransferThreadSliceLengths_K0_M0_M1_K1
|
||||
S<2, 1, 128, 1>, // ABlockTransferThreadClusterLengths_K0_M0_M1_K1
|
||||
S<1, 2, 0, 3>, // ABlockTransferThreadClusterArrangeOrder
|
||||
S<1, 2, 0, 3>, // ABlockTransferSrcAccessOrder
|
||||
S<4, 1, 1, 4>, // ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1
|
||||
S<1, 2, 0, 3>, // ABlockTransferSrcVectorTensorContiguousDimOrder
|
||||
S<1, 1, 1, 4>, // ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1
|
||||
S<8, 1, 1, 4>, // BBlockTransferThreadSliceLengths_K0_N0_N1_K1
|
||||
S<2, 1, 128, 1>, // BBlockTransferThreadClusterLengths_K0_N0_N1_K1
|
||||
S<1, 2, 0, 3>, // BBlockTransferThreadClusterArrangeOrder
|
||||
S<1, 2, 0, 3>, // BBlockTransferSrcAccessOrder
|
||||
S<4, 1, 1, 4>, // BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1
|
||||
S<1, 2, 0, 3>, // BBlockTransferSrcVectorTensorContiguousDimOrder
|
||||
S<1, 1, 1, 4>, // BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1
|
||||
S<0, 1, 2, 3, 4, 5>, // CThreadTransferSrcDstAccessOrder
|
||||
5, // CThreadTransferSrcDstVectorDim
|
||||
4>; // CThreadTransferDstScalarPerVector
|
||||
|
||||
#include "run_conv2d_fwd_bias_perchannel_quantization_example.inc"
|
||||
|
||||
int main()
|
||||
{
|
||||
float scale_z_inv = 0.5f;
|
||||
const auto out_element_op = OutElementOp{scale_z_inv, ActivationOp{}};
|
||||
run_conv2d_fwd_bias_perchannel_quantization_example(out_element_op);
|
||||
};
|
||||
@@ -0,0 +1,85 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "common.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp"
|
||||
|
||||
using InDataType = int8_t;
|
||||
using WeiDataType = int8_t;
|
||||
using BiasDataType = int32_t;
|
||||
using AccDataType = int32_t;
|
||||
using OutDataType = int8_t;
|
||||
|
||||
template <ck::index_t... Is>
|
||||
using S = ck::Sequence<Is...>;
|
||||
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
using InElementOp = PassThrough;
|
||||
using WeiElementOp = PassThrough;
|
||||
using ActivationOp = ck::tensor_operation::element_wise::TanH;
|
||||
using OutElementOp = ck::tensor_operation::element_wise::Add_Mul_Activation_Mul_Clamp<ActivationOp>;
|
||||
|
||||
static constexpr auto ConvSpec =
|
||||
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
|
||||
|
||||
static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
|
||||
|
||||
template <ck::index_t NDimSpatial,
|
||||
typename InLayout,
|
||||
typename WeiLayout,
|
||||
typename BiasLayout,
|
||||
typename OutLayout>
|
||||
using DeviceGroupedConvNDFwdInstance =
|
||||
ck::tensor_operation::device::DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK<
|
||||
NDimSpatial,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
ck::Tuple<BiasDataType>,
|
||||
OutDataType,
|
||||
AccDataType,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
ck::Tuple<BiasLayout>,
|
||||
OutLayout,
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
OutElementOp,
|
||||
ConvSpec, // ConvForwardSpecialization
|
||||
GemmSpec, // GemmSpecialization
|
||||
256, // BlockSize
|
||||
128, // MPerBlock
|
||||
128, // NPerBlock
|
||||
16, // K0PerBlock
|
||||
4, // K1
|
||||
4, // M1PerThread
|
||||
4, // N1PerThread
|
||||
1, // KPerThread
|
||||
S<8, 2>, // M1N1ThreadClusterM1Xs
|
||||
S<8, 2>, // M1N1ThreadClusterN1Xs
|
||||
S<8, 1, 1, 4>, // ABlockTransferThreadSliceLengths_K0_M0_M1_K1
|
||||
S<2, 1, 128, 1>, // ABlockTransferThreadClusterLengths_K0_M0_M1_K1
|
||||
S<1, 2, 0, 3>, // ABlockTransferThreadClusterArrangeOrder
|
||||
S<1, 2, 0, 3>, // ABlockTransferSrcAccessOrder
|
||||
S<4, 1, 1, 4>, // ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1
|
||||
S<1, 2, 0, 3>, // ABlockTransferSrcVectorTensorContiguousDimOrder
|
||||
S<1, 1, 1, 4>, // ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1
|
||||
S<8, 1, 1, 4>, // BBlockTransferThreadSliceLengths_K0_N0_N1_K1
|
||||
S<2, 1, 128, 1>, // BBlockTransferThreadClusterLengths_K0_N0_N1_K1
|
||||
S<1, 2, 0, 3>, // BBlockTransferThreadClusterArrangeOrder
|
||||
S<1, 2, 0, 3>, // BBlockTransferSrcAccessOrder
|
||||
S<4, 1, 1, 4>, // BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1
|
||||
S<1, 2, 0, 3>, // BBlockTransferSrcVectorTensorContiguousDimOrder
|
||||
S<1, 1, 1, 4>, // BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1
|
||||
S<0, 1, 2, 3, 4, 5>, // CThreadTransferSrcDstAccessOrder
|
||||
5, // CThreadTransferSrcDstVectorDim
|
||||
4>; // CThreadTransferDstScalarPerVector
|
||||
|
||||
#include "run_conv2d_fwd_bias_perlayer_quantization_example.inc"
|
||||
|
||||
int main()
|
||||
{
|
||||
float scale_acc = 0.5f;
|
||||
float scale_z_inv = 0.5f;
|
||||
const auto out_element_op = OutElementOp{scale_z_inv, scale_acc, ActivationOp{}};
|
||||
run_conv2d_fwd_bias_perlayer_quantization_example(out_element_op);
|
||||
}
|
||||
@@ -76,4 +76,8 @@ using DeviceGroupedConvNDFwdInstance =
|
||||
|
||||
#include "run_conv2d_fwd_perchannel_quantization_example.inc"
|
||||
|
||||
int main() { run_conv2d_fwd_perchannel_quantization_example(); }
|
||||
int main()
|
||||
{
|
||||
const auto out_element_op = OutElementOp{ActivationOp{}};
|
||||
run_conv2d_fwd_perchannel_quantization_example(out_element_op);
|
||||
}
|
||||
|
||||
@@ -71,4 +71,9 @@ using DeviceGroupedConvNDFwdInstance =
|
||||
|
||||
#include "run_conv2d_fwd_perlayer_quantization_example.inc"
|
||||
|
||||
int main() { run_conv2d_fwd_perlayer_quantization_example(); }
|
||||
int main()
|
||||
{
|
||||
float requant_scale = 0.5f;
|
||||
const auto out_element_op = OutElementOp{requant_scale, ActivationOp{}};
|
||||
run_conv2d_fwd_perlayer_quantization_example(out_element_op);
|
||||
}
|
||||
|
||||
@@ -80,6 +80,10 @@ using DeviceGroupedConvNDFwdInstance =
|
||||
S<1, 64, 1, 4>,
|
||||
8>;
|
||||
|
||||
#include "run_conv2d_fwd_bias_relu_perchannel_quantization_example.inc"
|
||||
#include "run_conv2d_fwd_bias_perchannel_quantization_example.inc"
|
||||
|
||||
int main() { run_conv2d_fwd_bias_relu_perchannel_quantization_example(); };
|
||||
int main()
|
||||
{
|
||||
const auto out_element_op = OutElementOp{ActivationOp{}};
|
||||
run_conv2d_fwd_bias_perchannel_quantization_example(out_element_op);
|
||||
};
|
||||
|
||||
@@ -78,6 +78,11 @@ using DeviceGroupedConvNDFwdInstance =
|
||||
S<1, 64, 1, 4>,
|
||||
8>;
|
||||
|
||||
#include "run_conv2d_fwd_bias_relu_perlayer_quantization_example.inc"
|
||||
#include "run_conv2d_fwd_bias_perlayer_quantization_example.inc"
|
||||
|
||||
int main() { run_conv2d_fwd_bias_relu_perlayer_quantization_example(); }
|
||||
int main()
|
||||
{
|
||||
float requant_scale = 0.5f;
|
||||
const auto out_element_op = OutElementOp{requant_scale, ActivationOp{}};
|
||||
run_conv2d_fwd_bias_perlayer_quantization_example(out_element_op);
|
||||
}
|
||||
|
||||
@@ -80,4 +80,8 @@ using DeviceGroupedConvNDFwdInstance =
|
||||
|
||||
#include "run_conv2d_fwd_perchannel_quantization_example.inc"
|
||||
|
||||
int main() { run_conv2d_fwd_perchannel_quantization_example(); }
|
||||
int main()
|
||||
{
|
||||
const auto out_element_op = OutElementOp{ActivationOp{}};
|
||||
run_conv2d_fwd_perchannel_quantization_example(out_element_op);
|
||||
}
|
||||
|
||||
@@ -75,4 +75,9 @@ using DeviceGroupedConvNDFwdInstance =
|
||||
|
||||
#include "run_conv2d_fwd_perlayer_quantization_example.inc"
|
||||
|
||||
int main() { run_conv2d_fwd_perlayer_quantization_example(); }
|
||||
int main()
|
||||
{
|
||||
float requant_scale = 0.5f;
|
||||
const auto out_element_op = OutElementOp{requant_scale, ActivationOp{}};
|
||||
run_conv2d_fwd_perlayer_quantization_example(out_element_op);
|
||||
}
|
||||
|
||||
@@ -167,7 +167,7 @@ bool run_grouped_conv_fwd(bool do_verification,
|
||||
return (pass ? 0 : 1);
|
||||
}
|
||||
|
||||
int run_conv2d_fwd_bias_relu_perchannel_quantization_example()
|
||||
int run_conv2d_fwd_bias_perchannel_quantization_example(const OutElementOp& out_element_op)
|
||||
{
|
||||
bool do_verification = true;
|
||||
bool time_kernel = true;
|
||||
@@ -189,7 +189,6 @@ int run_conv2d_fwd_bias_relu_perchannel_quantization_example()
|
||||
|
||||
const auto in_element_op = InElementOp{};
|
||||
const auto wei_element_op = WeiElementOp{};
|
||||
const auto out_element_op = OutElementOp{ActivationOp{}};
|
||||
|
||||
using InLayout = ck::tensor_layout::convolution::GNHWC;
|
||||
using WeiLayout = ck::tensor_layout::convolution::GKYXC;
|
||||
@@ -155,7 +155,7 @@ bool run_grouped_conv_fwd(bool do_verification,
|
||||
return (pass ? 0 : 1);
|
||||
}
|
||||
|
||||
int run_conv2d_fwd_bias_relu_perlayer_quantization_example()
|
||||
int run_conv2d_fwd_bias_perlayer_quantization_example(const OutElementOp& out_element_op)
|
||||
{
|
||||
bool do_verification = true;
|
||||
bool time_kernel = true;
|
||||
@@ -177,7 +177,6 @@ int run_conv2d_fwd_bias_relu_perlayer_quantization_example()
|
||||
|
||||
const auto in_element_op = InElementOp{};
|
||||
const auto wei_element_op = WeiElementOp{};
|
||||
const auto out_element_op = OutElementOp{0.5f, ActivationOp{}};
|
||||
|
||||
using InLayout = ck::tensor_layout::convolution::GNHWC;
|
||||
using WeiLayout = ck::tensor_layout::convolution::GKYXC;
|
||||
@@ -157,7 +157,7 @@ bool run_grouped_conv_fwd(bool do_verification,
|
||||
return (pass ? 0 : 1);
|
||||
}
|
||||
|
||||
int run_conv2d_fwd_perchannel_quantization_example()
|
||||
int run_conv2d_fwd_perchannel_quantization_example(const OutElementOp& out_element_op)
|
||||
{
|
||||
bool do_verification = true;
|
||||
bool time_kernel = true;
|
||||
@@ -179,7 +179,6 @@ int run_conv2d_fwd_perchannel_quantization_example()
|
||||
|
||||
const auto in_element_op = InElementOp{};
|
||||
const auto wei_element_op = WeiElementOp{};
|
||||
const auto out_element_op = OutElementOp{ActivationOp{}};
|
||||
|
||||
using InLayout = ck::tensor_layout::convolution::GNHWC;
|
||||
using WeiLayout = ck::tensor_layout::convolution::GKYXC;
|
||||
|
||||
@@ -139,7 +139,7 @@ bool run_grouped_conv_fwd(bool do_verification,
|
||||
return (pass ? 0 : 1);
|
||||
}
|
||||
|
||||
int run_conv2d_fwd_perlayer_quantization_example()
|
||||
int run_conv2d_fwd_perlayer_quantization_example(const OutElementOp& out_element_op)
|
||||
{
|
||||
bool do_verification = true;
|
||||
bool time_kernel = false;
|
||||
@@ -161,7 +161,6 @@ int run_conv2d_fwd_perlayer_quantization_example()
|
||||
|
||||
const auto in_element_op = InElementOp{};
|
||||
const auto wei_element_op = WeiElementOp{};
|
||||
const auto out_element_op = OutElementOp{0.5f, ActivationOp{}};
|
||||
|
||||
using InLayout = ck::tensor_layout::convolution::GNHWC;
|
||||
using WeiLayout = ck::tensor_layout::convolution::GKYXC;
|
||||
|
||||
@@ -7,10 +7,30 @@ namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace element_wise {
|
||||
|
||||
// Y = Sy * Qy
|
||||
// W = Sw * Qw
|
||||
// X = Sx * Qx
|
||||
// B = Sb * Qb = Sw * Sx * Qb
|
||||
// Where X, W, Y are float32, Qx, Qw, Qy are int8
|
||||
// Sx, Sw, Sy are scale of x, w, y (float32), which is calculated from quantization range
|
||||
// Qb is int32, scale of B is Sw * Sx for convenient
|
||||
|
||||
// Y = W @ X, where @ is convolution or matrix multiplication
|
||||
// Sy * Qy = Sw * Qw @ Sx * Qx
|
||||
// Qy = [(Sw*Sx)/Sy] * Qw @ Qx
|
||||
|
||||
// For Activation function which is piecewise linear function, such as relu, leaky relu ...etc
|
||||
// Activation(Sy * Qy) = Sy * Activation(Qy)
|
||||
template <typename Activation>
|
||||
struct Activation_Mul_Clamp
|
||||
{
|
||||
// Convolution + Activation (piecewise linear function)
|
||||
// If an activation is piecewise linear function, then Activation(Sy * Qy) = Sy * Activation(Qy)
|
||||
// Z = Activation(Y) = Activation(W @ X)
|
||||
// Sz * Qz = Activation(Sy * Qy)
|
||||
// Qz = Sy / Sz * Activation(Qy) = (Sw * Sx / Sz) * Activation(Qw @ Qx)
|
||||
|
||||
// requantScale_ = Sw * Sx / Sz
|
||||
Activation_Mul_Clamp(float requantScale, Activation activationOp)
|
||||
: requantScale_(requantScale), activationOp_(activationOp)
|
||||
{
|
||||
@@ -45,8 +65,39 @@ struct Activation_Mul_Clamp
|
||||
Activation activationOp_;
|
||||
};
|
||||
|
||||
// For Activation function which is non piecewise linear function, such as TanH, Sigmoid ...etc
|
||||
// If an activation is not piecewise linear function
|
||||
// then Activation(Sy * Qy) != Sy * Activation(Qy)
|
||||
template <typename Activation>
|
||||
struct Mul_Activation_Mul_Clamp
|
||||
{
|
||||
// Convolution + Activation (non piecewise linear function)
|
||||
// Z = Activation(Y) = Activation(W @ X)
|
||||
// Sz * Qz = Activation(Sy * Qy)
|
||||
// Qz = S1 * Activation[Sacc * (Qw @ Qx)]
|
||||
// Where S1 = 1 / Sz, Sacc = Sw * Sx
|
||||
Mul_Activation_Mul_Clamp(float scale_z_inv, float scaleAcc, Activation activationOp)
|
||||
: scale_z_inv_(scale_z_inv), scaleAcc_(scaleAcc), activationOp_(activationOp)
|
||||
{
|
||||
}
|
||||
|
||||
__host__ __device__ constexpr void operator()(int8_t& y, const int32_t& x) const
|
||||
{
|
||||
float y_fp32 = ck::type_convert<float>(x);
|
||||
y_fp32 = scaleAcc_ * y_fp32;
|
||||
activationOp_(y_fp32, y_fp32);
|
||||
y_fp32 = math::clamp(scale_z_inv_ * y_fp32, -128.f, 127.f);
|
||||
y = ck::type_convert<int8_t>(y_fp32);
|
||||
}
|
||||
|
||||
float scale_z_inv_;
|
||||
float scaleAcc_;
|
||||
Activation activationOp_;
|
||||
};
|
||||
|
||||
// Conv Perchannel quantization + Activation function which is piecewise linear function, such as
|
||||
// relu, leaky relu ...etc
|
||||
// Activation(Sy * Qy) = Sy * Activation(Qy)
|
||||
template <typename Activation>
|
||||
struct Activation_Mul2_Clamp
|
||||
{
|
||||
@@ -76,9 +127,20 @@ struct Activation_Mul2_Clamp
|
||||
};
|
||||
|
||||
// For Activation function which is piecewise linear function, such as relu, leaky relu ...etc
|
||||
// Activation(Sy * Qy) = Sy * Activation(Qy)
|
||||
template <typename Activation>
|
||||
struct Add_Activation_Mul_Clamp
|
||||
{
|
||||
// Convolution + bias
|
||||
// Let Bias = B = Sw * Sx * Qb
|
||||
// Where Qb is int32
|
||||
// Y = W @ X + B
|
||||
// Sy * Qy = Sw * Qw @ Sx * Qx + Sw * Sx * Qb
|
||||
// Qy = [(Sw*Sx)/Sy] * (Qw @ Qx + Qb)
|
||||
|
||||
// For activation, Z = Activaiton(Y)
|
||||
// Sz * Qz = Activation(Sy * Qy)
|
||||
// Qz = Sy / Sz * Activation(Qy) = [(Sw*Sx)/Sz] * Activation(Qw @ Qx + Qb)
|
||||
Add_Activation_Mul_Clamp(float requantScale, Activation activationOp)
|
||||
: requantScale_(requantScale), activationOp_(activationOp)
|
||||
{
|
||||
@@ -139,11 +201,18 @@ struct Add_Activation_Mul2_Clamp
|
||||
};
|
||||
|
||||
// For Activation function which is non piecewise linear function, such as TanH, Sigmoid ...etc
|
||||
// If an activation is not piecewise linear function
|
||||
// then Activation(Sy * Qy) != Sy * Activation(Qy)
|
||||
template <typename Activation>
|
||||
struct Add_Mul_Activation_Mul_Clamp
|
||||
{
|
||||
Add_Mul_Activation_Mul_Clamp(float requantScale1, float requantScale2, Activation activationOp)
|
||||
: requantScale1_(requantScale1), requantScale2_(requantScale2), activationOp_(activationOp)
|
||||
// Convolution + Activation (non piecewise linear function)
|
||||
// Z = Activation(Y) = Activation(W @ X + B)
|
||||
// Sz * Qz = Activation(Sy * Qy)
|
||||
// Qz = S1 * Activation[Sacc * (Qw @ Qx + Qb)]
|
||||
// Where S1 = 1 / Sz, Sacc = Sw * Sx
|
||||
Add_Mul_Activation_Mul_Clamp(float scale_z_inv, float scaleAcc, Activation activationOp)
|
||||
: scale_z_inv_(scale_z_inv), scaleAcc_(scaleAcc), activationOp_(activationOp)
|
||||
{
|
||||
}
|
||||
|
||||
@@ -151,14 +220,64 @@ struct Add_Mul_Activation_Mul_Clamp
|
||||
operator()(int8_t& y, const int32_t& x, const int32_t& bias) const
|
||||
{
|
||||
float y_fp32 = ck::type_convert<float>(x + bias);
|
||||
y_fp32 = requantScale1_ * y_fp32;
|
||||
y_fp32 = scaleAcc_ * y_fp32;
|
||||
activationOp_(y_fp32, y_fp32);
|
||||
y_fp32 = math::clamp(requantScale2_ * y_fp32, -128.f, 127.f);
|
||||
y_fp32 = math::clamp(scale_z_inv_ * y_fp32, -128.f, 127.f);
|
||||
y = ck::type_convert<int8_t>(y_fp32);
|
||||
}
|
||||
|
||||
float requantScale1_;
|
||||
float requantScale2_;
|
||||
__host__ __device__ constexpr void
|
||||
operator()(int32_t& y, const int32_t& x, const int32_t& bias) const
|
||||
{
|
||||
// CAUSION - We might type_convert to int8 in threadwise copy
|
||||
// eg. GridwiseGemmDlMultipleD_km_kn_mn
|
||||
float y_fp32 = ck::type_convert<float>(x + bias);
|
||||
y_fp32 = scaleAcc_ * y_fp32;
|
||||
activationOp_(y_fp32, y_fp32);
|
||||
y_fp32 = math::clamp(scale_z_inv_ * y_fp32, -128.f, 127.f);
|
||||
y = ck::type_convert<int32_t>(y_fp32);
|
||||
}
|
||||
|
||||
float scale_z_inv_;
|
||||
float scaleAcc_;
|
||||
Activation activationOp_;
|
||||
};
|
||||
|
||||
// Conv Perchannel quantization + Activation function which is non piecewise linear function,
|
||||
// such as TanH, Sigmoid ...etc
|
||||
// If an activation is not piecewise linear function
|
||||
// then Activation(Sy *Qy) != Sy * Activation(Qy)
|
||||
template <typename Activation>
|
||||
struct Add_Mul2_Activation_Mul_Clamp
|
||||
{
|
||||
Add_Mul2_Activation_Mul_Clamp(float scale_z_inv, Activation activationOp)
|
||||
: scale_z_inv_(scale_z_inv), activationOp_(activationOp)
|
||||
{
|
||||
}
|
||||
|
||||
__host__ __device__ constexpr void
|
||||
operator()(int8_t& y, const int32_t& x, const int32_t& bias, const float& scaleAcc) const
|
||||
{
|
||||
float y_fp32 = ck::type_convert<float>(x + bias);
|
||||
y_fp32 = scaleAcc * y_fp32;
|
||||
activationOp_(y_fp32, y_fp32);
|
||||
y_fp32 = math::clamp(scale_z_inv_ * y_fp32, -128.f, 127.f);
|
||||
y = ck::type_convert<int8_t>(y_fp32);
|
||||
}
|
||||
|
||||
__host__ __device__ constexpr void
|
||||
operator()(int32_t& y, const int32_t& x, const int32_t& bias, const float& scaleAcc) const
|
||||
{
|
||||
// CAUSION - We might type_convert to int8 in threadwise copy
|
||||
// eg. GridwiseGemmDlMultipleD_km_kn_mn
|
||||
float y_fp32 = ck::type_convert<float>(x + bias);
|
||||
y_fp32 = scaleAcc * y_fp32;
|
||||
activationOp_(y_fp32, y_fp32);
|
||||
y_fp32 = math::clamp(scale_z_inv_ * y_fp32, -128.f, 127.f);
|
||||
y = ck::type_convert<int32_t>(y_fp32);
|
||||
}
|
||||
|
||||
float scale_z_inv_;
|
||||
Activation activationOp_;
|
||||
};
|
||||
|
||||
|
||||
@@ -320,6 +320,19 @@ struct Sigmoid
|
||||
int32_t divider_ = 1;
|
||||
};
|
||||
|
||||
struct TanH
|
||||
{
|
||||
template <typename T>
|
||||
__host__ __device__ void operator()(T& y, const T& x) const
|
||||
{
|
||||
static_assert(is_same<T, float>::value || is_same<T, double>::value ||
|
||||
is_same<T, ck::half_t>::value,
|
||||
"Data type is not supported by this operation!");
|
||||
|
||||
y = ck::math::tanh(x);
|
||||
};
|
||||
};
|
||||
|
||||
} // namespace element_wise
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
|
||||
@@ -92,6 +92,15 @@ static inline __host__ float sqrt(float x) { return std::sqrt(x); };
|
||||
|
||||
static inline __host__ double sqrt(double x) { return std::sqrt(x); };
|
||||
|
||||
static inline __host__ half_t tanh(half_t x)
|
||||
{
|
||||
return static_cast<half_t>(std::tanh(static_cast<float>(x)));
|
||||
};
|
||||
|
||||
static inline __host__ float tanh(float x) { return std::tanh(x); };
|
||||
|
||||
static inline __host__ double tanh(double x) { return std::tanh(x); };
|
||||
|
||||
// math functions for the HIP kernel, some are implemented by calling hip builtin functions
|
||||
|
||||
static inline __device__ float abs(float x) { return ::abs(x); };
|
||||
@@ -172,5 +181,14 @@ static inline __device__ float sqrt(float x) { return __builtin_amdgcn_sqrtf(x);
|
||||
|
||||
static inline __device__ double sqrt(double x) { return __builtin_amdgcn_sqrt(x); };
|
||||
|
||||
static inline __device__ half_t tanh(half_t x)
|
||||
{
|
||||
return static_cast<half_t>(::tanhf(static_cast<float>(x)));
|
||||
};
|
||||
|
||||
static inline __device__ float tanh(float x) { return ::tanhf(x); };
|
||||
|
||||
static inline __device__ double tanh(double x) { return ::tanh(x); };
|
||||
|
||||
} // namespace math
|
||||
} // namespace ck
|
||||
|
||||
@@ -85,6 +85,7 @@ using GK_GK_Tuple = ck::Tuple<GK, GK>;
|
||||
// pointwise functor
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
using Relu = ck::tensor_operation::element_wise::Relu;
|
||||
using TanH = ck::tensor_operation::element_wise::TanH;
|
||||
using Scale = ck::tensor_operation::element_wise::Scale;
|
||||
using Bilinear = ck::tensor_operation::element_wise::Bilinear;
|
||||
using AddAddFastGelu = ck::tensor_operation::element_wise::AddAddFastGelu;
|
||||
@@ -102,6 +103,10 @@ template <typename Activation>
|
||||
using Add_Activation_Mul_Clamp =
|
||||
ck::tensor_operation::element_wise::Add_Activation_Mul_Clamp<Activation>;
|
||||
|
||||
template <typename Activation>
|
||||
using Add_Mul_Activation_Mul_Clamp =
|
||||
ck::tensor_operation::element_wise::Add_Mul_Activation_Mul_Clamp<Activation>;
|
||||
|
||||
template <typename Activation>
|
||||
using Activation_Mul2_Clamp = ck::tensor_operation::element_wise::Activation_Mul2_Clamp<Activation>;
|
||||
|
||||
@@ -109,6 +114,10 @@ template <typename Activation>
|
||||
using Add_Activation_Mul2_Clamp =
|
||||
ck::tensor_operation::element_wise::Add_Activation_Mul2_Clamp<Activation>;
|
||||
|
||||
template <typename Activation>
|
||||
using Add_Mul2_Activation_Mul_Clamp =
|
||||
ck::tensor_operation::element_wise::Add_Mul2_Activation_Mul_Clamp<Activation>;
|
||||
|
||||
template <typename DeviceOp, typename Tag = void>
|
||||
struct DeviceOperationInstanceFactory;
|
||||
|
||||
|
||||
@@ -49,6 +49,22 @@ void add_device_conv2d_dl_bias_relu_perchannel_quantization_int8_instances(
|
||||
Add_Activation_Mul2_Clamp<Relu>>>>&
|
||||
instances);
|
||||
|
||||
void add_device_conv2d_dl_bias_tanh_perchannel_quantization_int8_instances(
|
||||
std::vector<
|
||||
std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
|
||||
GNHWC,
|
||||
GKYXC,
|
||||
GK_GK_Tuple,
|
||||
GNHWK,
|
||||
int8_t,
|
||||
int8_t,
|
||||
I32_F32_Tuple,
|
||||
int8_t,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Add_Mul2_Activation_Mul_Clamp<TanH>>>>&
|
||||
instances);
|
||||
|
||||
void add_device_conv2d_xdl_bias_perchannel_quantization_int8_instances(
|
||||
std::vector<
|
||||
std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
|
||||
@@ -80,6 +96,23 @@ void add_device_conv2d_xdl_bias_relu_perchannel_quantization_int8_instances(
|
||||
Add_Activation_Mul2_Clamp<Relu>>>>&
|
||||
instances);
|
||||
|
||||
void add_device_conv2d_xdl_bias_tanh_perchannel_quantization_int8_instances(
|
||||
std::vector<
|
||||
std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
|
||||
GNHWC,
|
||||
GKYXC,
|
||||
GK_GK_Tuple,
|
||||
GNHWK,
|
||||
int8_t,
|
||||
int8_t,
|
||||
I32_F32_Tuple,
|
||||
int8_t,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Add_Mul2_Activation_Mul_Clamp<TanH>>>>&
|
||||
instances);
|
||||
|
||||
// piecewise activation function
|
||||
template <ck::index_t NumDimSpatial,
|
||||
typename InLayout,
|
||||
typename WeiLayout,
|
||||
@@ -145,6 +178,67 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
|
||||
}
|
||||
};
|
||||
|
||||
// non-piecewise activation function
|
||||
template <ck::index_t NumDimSpatial,
|
||||
typename InLayout,
|
||||
typename WeiLayout,
|
||||
typename DsLayout,
|
||||
typename OutLayout,
|
||||
typename InDataType,
|
||||
typename WeiDataType,
|
||||
typename DsDataType,
|
||||
typename OutDataType,
|
||||
typename Activation>
|
||||
struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<
|
||||
NumDimSpatial,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
DsLayout,
|
||||
OutLayout,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
DsDataType,
|
||||
OutDataType,
|
||||
ck::tensor_operation::element_wise::PassThrough,
|
||||
ck::tensor_operation::element_wise::PassThrough,
|
||||
Add_Mul2_Activation_Mul_Clamp<Activation>>>
|
||||
{
|
||||
using DeviceOp = DeviceGroupedConvFwdMultipleD<NumDimSpatial,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
DsLayout,
|
||||
OutLayout,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
DsDataType,
|
||||
OutDataType,
|
||||
ck::tensor_operation::element_wise::PassThrough,
|
||||
ck::tensor_operation::element_wise::PassThrough,
|
||||
Add_Mul2_Activation_Mul_Clamp<Activation>>;
|
||||
|
||||
static auto GetInstances()
|
||||
{
|
||||
std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
|
||||
|
||||
if constexpr(NumDimSpatial == 2 && is_same_v<InLayout, GNHWC> &&
|
||||
is_same_v<WeiLayout, GKYXC> && is_same_v<DsLayout, GK_GK_Tuple> &&
|
||||
is_same_v<OutLayout, GNHWK>)
|
||||
{
|
||||
if constexpr(is_same_v<InDataType, int8_t> && is_same_v<WeiDataType, int8_t> &&
|
||||
is_same_v<DsDataType, I32_F32_Tuple> && is_same_v<OutDataType, int8_t>)
|
||||
{
|
||||
if constexpr(is_same_v<Activation, TanH>)
|
||||
{
|
||||
add_device_conv2d_dl_bias_tanh_perchannel_quantization_int8_instances(op_ptrs);
|
||||
add_device_conv2d_xdl_bias_tanh_perchannel_quantization_int8_instances(op_ptrs);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return op_ptrs;
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
@@ -49,6 +49,21 @@ void add_device_conv2d_dl_bias_relu_perlayer_quantization_int8_instances(
|
||||
Add_Activation_Mul_Clamp<Relu>>>>&
|
||||
instances);
|
||||
|
||||
void add_device_conv2d_dl_bias_tanh_perlayer_quantization_int8_instances(
|
||||
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
|
||||
GNHWC,
|
||||
GKYXC,
|
||||
GK_Tuple,
|
||||
GNHWK,
|
||||
int8_t,
|
||||
int8_t,
|
||||
I32_Tuple,
|
||||
int8_t,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Add_Mul_Activation_Mul_Clamp<TanH>>>>&
|
||||
instances);
|
||||
|
||||
void add_device_conv2d_xdl_bias_perlayer_quantization_int8_instances(
|
||||
std::vector<
|
||||
std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
|
||||
@@ -80,6 +95,22 @@ void add_device_conv2d_xdl_bias_relu_perlayer_quantization_int8_instances(
|
||||
Add_Activation_Mul_Clamp<Relu>>>>&
|
||||
instances);
|
||||
|
||||
void add_device_conv2d_xdl_bias_tanh_perlayer_quantization_int8_instances(
|
||||
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
|
||||
GNHWC,
|
||||
GKYXC,
|
||||
GK_Tuple,
|
||||
GNHWK,
|
||||
int8_t,
|
||||
int8_t,
|
||||
I32_Tuple,
|
||||
int8_t,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Add_Mul_Activation_Mul_Clamp<TanH>>>>&
|
||||
instances);
|
||||
|
||||
// piecewise activation function
|
||||
template <ck::index_t NumDimSpatial,
|
||||
typename InLayout,
|
||||
typename WeiLayout,
|
||||
@@ -145,6 +176,67 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
|
||||
}
|
||||
};
|
||||
|
||||
// non-piecewise activation function
|
||||
template <ck::index_t NumDimSpatial,
|
||||
typename InLayout,
|
||||
typename WeiLayout,
|
||||
typename DsLayout,
|
||||
typename OutLayout,
|
||||
typename InDataType,
|
||||
typename WeiDataType,
|
||||
typename DsDataType,
|
||||
typename OutDataType,
|
||||
typename Activation>
|
||||
struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<
|
||||
NumDimSpatial,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
DsLayout,
|
||||
OutLayout,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
DsDataType,
|
||||
OutDataType,
|
||||
ck::tensor_operation::element_wise::PassThrough,
|
||||
ck::tensor_operation::element_wise::PassThrough,
|
||||
Add_Mul_Activation_Mul_Clamp<Activation>>>
|
||||
{
|
||||
using DeviceOp = DeviceGroupedConvFwdMultipleD<NumDimSpatial,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
DsLayout,
|
||||
OutLayout,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
DsDataType,
|
||||
OutDataType,
|
||||
ck::tensor_operation::element_wise::PassThrough,
|
||||
ck::tensor_operation::element_wise::PassThrough,
|
||||
Add_Mul_Activation_Mul_Clamp<Activation>>;
|
||||
|
||||
static auto GetInstances()
|
||||
{
|
||||
std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
|
||||
|
||||
if constexpr(NumDimSpatial == 2 && is_same_v<InLayout, GNHWC> &&
|
||||
is_same_v<WeiLayout, GKYXC> && is_same_v<DsLayout, GK_Tuple> &&
|
||||
is_same_v<OutLayout, GNHWK>)
|
||||
{
|
||||
if constexpr(is_same_v<InDataType, int8_t> && is_same_v<WeiDataType, int8_t> &&
|
||||
is_same_v<DsDataType, I32_Tuple> && is_same_v<OutDataType, int8_t>)
|
||||
{
|
||||
if constexpr(is_same_v<Activation, TanH>)
|
||||
{
|
||||
add_device_conv2d_dl_bias_tanh_perlayer_quantization_int8_instances(op_ptrs);
|
||||
add_device_conv2d_xdl_bias_tanh_perlayer_quantization_int8_instances(op_ptrs);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return op_ptrs;
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
@@ -25,6 +25,7 @@ using GNHWK = ck::tensor_layout::convolution::GNHWK;
|
||||
using GK = ck::tensor_layout::convolution::G_K;
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
using Relu = ck::tensor_operation::element_wise::Relu;
|
||||
using TanH = ck::tensor_operation::element_wise::TanH;
|
||||
|
||||
using GK_Tuple = ck::Tuple<GK>;
|
||||
using GK_GK_Tuple = ck::Tuple<GK, GK>;
|
||||
@@ -32,17 +33,25 @@ using I32_Tuple = ck::Tuple<int32_t>;
|
||||
using F32_Tuple = ck::Tuple<float>;
|
||||
using I32_F32_Tuple = ck::Tuple<int32_t, float>;
|
||||
|
||||
// perlayer
|
||||
using Mul_Clamp = ck::tensor_operation::element_wise::Activation_Mul_Clamp<PassThrough>;
|
||||
using Relu_Mul_Clamp = ck::tensor_operation::element_wise::Activation_Mul_Clamp<Relu>;
|
||||
|
||||
// bias + perlayer
|
||||
using Add_Mul_Clamp = ck::tensor_operation::element_wise::Add_Activation_Mul_Clamp<PassThrough>;
|
||||
using Add_Relu_Mul_Clamp = ck::tensor_operation::element_wise::Add_Activation_Mul_Clamp<Relu>;
|
||||
using Add_Mul_TanH_Mul_Clamp =
|
||||
ck::tensor_operation::element_wise::Add_Mul_Activation_Mul_Clamp<TanH>;
|
||||
|
||||
// perchannel
|
||||
using Mul2_Clamp = ck::tensor_operation::element_wise::Activation_Mul2_Clamp<PassThrough>;
|
||||
using Relu_Mul2_Clamp = ck::tensor_operation::element_wise::Activation_Mul2_Clamp<Relu>;
|
||||
|
||||
// bias + perchannel
|
||||
using Add_Mul2_Clamp = ck::tensor_operation::element_wise::Add_Activation_Mul2_Clamp<PassThrough>;
|
||||
using Add_Relu_Mul2_Clamp = ck::tensor_operation::element_wise::Add_Activation_Mul2_Clamp<Relu>;
|
||||
using Add_Mul2_TanH_Mul_Clamp =
|
||||
ck::tensor_operation::element_wise::Add_Mul2_Activation_Mul_Clamp<TanH>;
|
||||
|
||||
static constexpr ck::index_t NDimSpatial = 2;
|
||||
static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
|
||||
|
||||
@@ -76,6 +76,42 @@ void add_device_conv2d_dl_bias_relu_perchannel_quantization_int8_instances(
|
||||
ConvFwd1x1S1P0,
|
||||
4>{});
|
||||
}
|
||||
|
||||
void add_device_conv2d_dl_bias_tanh_perchannel_quantization_int8_instances(
|
||||
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
|
||||
GNHWC,
|
||||
GKYXC,
|
||||
GK_GK_Tuple,
|
||||
GNHWK,
|
||||
int8_t,
|
||||
int8_t,
|
||||
I32_F32_Tuple,
|
||||
int8_t,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Add_Mul2_TanH_Mul_Clamp>>>& instances)
|
||||
{
|
||||
// dl
|
||||
add_device_operation_instances(instances,
|
||||
device_grouped_conv2d_dl_int8_instances<GK_GK_Tuple,
|
||||
I32_F32_Tuple,
|
||||
Add_Mul2_TanH_Mul_Clamp,
|
||||
ConvFwdDefault,
|
||||
4>{});
|
||||
add_device_operation_instances(instances,
|
||||
device_grouped_conv2d_dl_int8_instances<GK_GK_Tuple,
|
||||
I32_F32_Tuple,
|
||||
Add_Mul2_TanH_Mul_Clamp,
|
||||
ConvFwd1x1P0,
|
||||
4>{});
|
||||
add_device_operation_instances(instances,
|
||||
device_grouped_conv2d_dl_int8_instances<GK_GK_Tuple,
|
||||
I32_F32_Tuple,
|
||||
Add_Mul2_TanH_Mul_Clamp,
|
||||
ConvFwd1x1S1P0,
|
||||
4>{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
@@ -76,6 +76,43 @@ void add_device_conv2d_dl_bias_relu_perlayer_quantization_int8_instances(
|
||||
ConvFwd1x1S1P0,
|
||||
4>{});
|
||||
}
|
||||
|
||||
void add_device_conv2d_dl_bias_tanh_perlayer_quantization_int8_instances(
|
||||
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
|
||||
GNHWC,
|
||||
GKYXC,
|
||||
GK_Tuple,
|
||||
GNHWK,
|
||||
int8_t,
|
||||
int8_t,
|
||||
I32_Tuple,
|
||||
int8_t,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Add_Mul_TanH_Mul_Clamp>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(instances,
|
||||
device_grouped_conv2d_dl_int8_instances<GK_Tuple,
|
||||
I32_Tuple,
|
||||
Add_Mul_TanH_Mul_Clamp,
|
||||
ConvFwdDefault,
|
||||
4>{});
|
||||
|
||||
add_device_operation_instances(instances,
|
||||
device_grouped_conv2d_dl_int8_instances<GK_Tuple,
|
||||
I32_Tuple,
|
||||
Add_Mul_TanH_Mul_Clamp,
|
||||
ConvFwd1x1P0,
|
||||
4>{});
|
||||
|
||||
add_device_operation_instances(instances,
|
||||
device_grouped_conv2d_dl_int8_instances<GK_Tuple,
|
||||
I32_Tuple,
|
||||
Add_Mul_TanH_Mul_Clamp,
|
||||
ConvFwd1x1S1P0,
|
||||
4>{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
@@ -74,6 +74,41 @@ void add_device_conv2d_xdl_bias_relu_perchannel_quantization_int8_instances(
|
||||
ConvFwd1x1S1P0,
|
||||
8>{});
|
||||
}
|
||||
|
||||
void add_device_conv2d_xdl_bias_tanh_perchannel_quantization_int8_instances(
|
||||
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
|
||||
GNHWC,
|
||||
GKYXC,
|
||||
GK_GK_Tuple,
|
||||
GNHWK,
|
||||
int8_t,
|
||||
int8_t,
|
||||
I32_F32_Tuple,
|
||||
int8_t,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Add_Mul2_TanH_Mul_Clamp>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(instances,
|
||||
device_grouped_conv2d_xdl_int8_instances<GK_GK_Tuple,
|
||||
I32_F32_Tuple,
|
||||
Add_Mul2_TanH_Mul_Clamp,
|
||||
ConvFwdDefault,
|
||||
8>{});
|
||||
add_device_operation_instances(instances,
|
||||
device_grouped_conv2d_xdl_int8_instances<GK_GK_Tuple,
|
||||
I32_F32_Tuple,
|
||||
Add_Mul2_TanH_Mul_Clamp,
|
||||
ConvFwd1x1P0,
|
||||
8>{});
|
||||
add_device_operation_instances(instances,
|
||||
device_grouped_conv2d_xdl_int8_instances<GK_GK_Tuple,
|
||||
I32_F32_Tuple,
|
||||
Add_Mul2_TanH_Mul_Clamp,
|
||||
ConvFwd1x1S1P0,
|
||||
8>{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
@@ -76,6 +76,43 @@ void add_device_conv2d_xdl_bias_relu_perlayer_quantization_int8_instances(
|
||||
ConvFwd1x1S1P0,
|
||||
8>{});
|
||||
}
|
||||
|
||||
void add_device_conv2d_xdl_bias_tanh_perlayer_quantization_int8_instances(
|
||||
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
|
||||
GNHWC,
|
||||
GKYXC,
|
||||
GK_Tuple,
|
||||
GNHWK,
|
||||
int8_t,
|
||||
int8_t,
|
||||
I32_Tuple,
|
||||
int8_t,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Add_Mul_TanH_Mul_Clamp>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(instances,
|
||||
device_grouped_conv2d_xdl_int8_instances<GK_Tuple,
|
||||
I32_Tuple,
|
||||
Add_Mul_TanH_Mul_Clamp,
|
||||
ConvFwdDefault,
|
||||
8>{});
|
||||
|
||||
add_device_operation_instances(instances,
|
||||
device_grouped_conv2d_xdl_int8_instances<GK_Tuple,
|
||||
I32_Tuple,
|
||||
Add_Mul_TanH_Mul_Clamp,
|
||||
ConvFwd1x1P0,
|
||||
8>{});
|
||||
|
||||
add_device_operation_instances(instances,
|
||||
device_grouped_conv2d_xdl_int8_instances<GK_Tuple,
|
||||
I32_Tuple,
|
||||
Add_Mul_TanH_Mul_Clamp,
|
||||
ConvFwd1x1S1P0,
|
||||
8>{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
Reference in New Issue
Block a user