mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-14 02:02:46 +00:00
gemm/Conv xdlops + dlops quantization (#625)
* Add conv perlayer quantization * Add gemm_dlops quantization * Support int8 for innerproduct * Refine gemm dlops int8 kernel parameter * Support gfx908(MI100) and gfx90a(MI200) * clang-format * Rename example number * Support different layout for d tensor * Add conv dlops perchannel quantization example * Move to example 40 * Extract the common code for different platform (dlops and xdlops) * Move ot subfolder. Prepare to add other op of quantization * Refine the quantization instance library * Add conv dl instances and client example * Remove unnecessary type * Add gemm quantization instance * Add external api and client example * Refine num_bytes * Separete different layout to different cpp * Add more xdl instances * Revert "Remove unnecessary type" This reverts commit820869182f. * Remove CShuffleDataType in dlops Let acc and CShuffleDataType be the same in xdlops --------- Co-authored-by: zjing14 <zhangjing14@gmail.com> [ROCm/composable_kernel commit:16dc18e0f9]
This commit is contained in:
@@ -9,3 +9,6 @@ target_link_libraries(client_conv2d_fwd_perchannel_quantization PRIVATE composab
|
||||
|
||||
add_executable(client_conv2d_fwd_perlayer_quantization conv2d_fwd_perlayer_quantization.cpp)
|
||||
target_link_libraries(client_conv2d_fwd_perlayer_quantization PRIVATE composable_kernel::device_operations)
|
||||
|
||||
add_executable(client_gemm_quantization gemm_quantization.cpp)
|
||||
target_link_libraries(client_gemm_quantization PRIVATE composable_kernel::device_operations)
|
||||
|
||||
@@ -28,16 +28,15 @@ using OutElementOp = ck::tensor_operation::element_wise::Add_Activation_Mul2_Cla
|
||||
|
||||
static constexpr ck::index_t NumDimSpatial = 2;
|
||||
static constexpr ck::index_t G = 1;
|
||||
static constexpr ck::index_t N = 4;
|
||||
static constexpr ck::index_t K = 64;
|
||||
static constexpr ck::index_t C = 32;
|
||||
static constexpr ck::index_t Y = 3;
|
||||
static constexpr ck::index_t X = 3;
|
||||
static constexpr ck::index_t Hi = 71;
|
||||
static constexpr ck::index_t Wi = 71;
|
||||
static constexpr ck::index_t Ho = 36;
|
||||
static constexpr ck::index_t Wo = 36;
|
||||
|
||||
static constexpr ck::index_t N = 4; // batch size
|
||||
static constexpr ck::index_t K = 64; // output channel
|
||||
static constexpr ck::index_t C = 192; // input channel
|
||||
static constexpr ck::index_t Y = 3; // filter H
|
||||
static constexpr ck::index_t X = 3; // filter W
|
||||
static constexpr ck::index_t Hi = 71; // input H
|
||||
static constexpr ck::index_t Wi = 71; // input W
|
||||
static constexpr ck::index_t Ho = 36; // output H
|
||||
static constexpr ck::index_t Wo = 36; // output W
|
||||
struct SimpleDeviceMem
|
||||
{
|
||||
SimpleDeviceMem() = delete;
|
||||
@@ -64,8 +63,8 @@ int main(int argc, char* argv[])
|
||||
std::array<ck::index_t, 5> bias_strides{K, 0, 1, 0, 0};
|
||||
std::array<ck::index_t, 5> requant_scale_lengths{G, N, K, Ho, Wo};
|
||||
std::array<ck::index_t, 5> requant_scale_strides{K, 0, 1, 0, 0};
|
||||
std::array<ck::index_t, 5> out_lengths{G, N, C, Ho, Wo};
|
||||
std::array<ck::index_t, 5> out_strides{N * Ho * Wo * C, Ho * Wo * C, 1, Wo * C, C};
|
||||
std::array<ck::index_t, 5> out_lengths{G, N, K, Ho, Wo};
|
||||
std::array<ck::index_t, 5> out_strides{N * Ho * Wo * K, Ho * Wo * K, 1, Wo * K, K};
|
||||
std::array<ck::index_t, 2> in_left_pad{1, 1};
|
||||
std::array<ck::index_t, 2> in_right_pad{1, 1};
|
||||
std::array<ck::index_t, 2> conv_strides{2, 2};
|
||||
@@ -136,10 +135,11 @@ int main(int argc, char* argv[])
|
||||
{
|
||||
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
|
||||
|
||||
std::size_t flop = G * 2 * N * K * C * Ho * Wo * Y * X;
|
||||
std::size_t num_bytes = G * sizeof(InDataType) * N * Hi * Wi * C +
|
||||
G * sizeof(WeiDataType) * K * Y * X * C +
|
||||
G * sizeof(OutDataType) * N * Ho * Wo * K;
|
||||
std::size_t flop = G * 2 * N * K * C * Ho * Wo * Y * X;
|
||||
std::size_t num_bytes =
|
||||
G * sizeof(InDataType) * N * Hi * Wi * C + G * sizeof(WeiDataType) * K * Y * X * C +
|
||||
G * sizeof(BiasDataType) * K + G * sizeof(RequantScaleDataType) * K +
|
||||
G * sizeof(OutDataType) * N * Ho * Wo * K;
|
||||
|
||||
float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
|
||||
float gb_per_sec = num_bytes / 1.E6 / avg_time;
|
||||
@@ -162,11 +162,12 @@ int main(int argc, char* argv[])
|
||||
}
|
||||
}
|
||||
|
||||
std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
|
||||
<< " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
|
||||
|
||||
// run the best intance
|
||||
if(best_op_id != -1)
|
||||
{
|
||||
std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
|
||||
<< " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
|
||||
|
||||
auto& op_ptr = op_ptrs[best_op_id];
|
||||
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
|
||||
<< std::endl;
|
||||
|
||||
@@ -26,15 +26,15 @@ using OutElementOp = ck::tensor_operation::element_wise::Add_Activation_Mul_Clam
|
||||
|
||||
static constexpr ck::index_t NumDimSpatial = 2;
|
||||
static constexpr ck::index_t G = 1;
|
||||
static constexpr ck::index_t N = 4;
|
||||
static constexpr ck::index_t K = 64;
|
||||
static constexpr ck::index_t C = 32;
|
||||
static constexpr ck::index_t Y = 3;
|
||||
static constexpr ck::index_t X = 3;
|
||||
static constexpr ck::index_t Hi = 71;
|
||||
static constexpr ck::index_t Wi = 71;
|
||||
static constexpr ck::index_t Ho = 36;
|
||||
static constexpr ck::index_t Wo = 36;
|
||||
static constexpr ck::index_t N = 4; // batch size
|
||||
static constexpr ck::index_t K = 64; // output channel
|
||||
static constexpr ck::index_t C = 192; // input channel
|
||||
static constexpr ck::index_t Y = 3; // filter H
|
||||
static constexpr ck::index_t X = 3; // filter W
|
||||
static constexpr ck::index_t Hi = 71; // input H
|
||||
static constexpr ck::index_t Wi = 71; // input W
|
||||
static constexpr ck::index_t Ho = 36; // output H
|
||||
static constexpr ck::index_t Wo = 36; // output W
|
||||
|
||||
struct SimpleDeviceMem
|
||||
{
|
||||
@@ -60,8 +60,8 @@ int main(int argc, char* argv[])
|
||||
std::array<ck::index_t, 5> weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C};
|
||||
std::array<ck::index_t, 5> bias_lengths{G, N, K, Ho, Wo};
|
||||
std::array<ck::index_t, 5> bias_strides{K, 0, 1, 0, 0};
|
||||
std::array<ck::index_t, 5> out_lengths{G, N, C, Ho, Wo};
|
||||
std::array<ck::index_t, 5> out_strides{N * Ho * Wo * C, Ho * Wo * C, 1, Wo * C, C};
|
||||
std::array<ck::index_t, 5> out_lengths{G, N, K, Ho, Wo};
|
||||
std::array<ck::index_t, 5> out_strides{N * Ho * Wo * K, Ho * Wo * K, 1, Wo * K, K};
|
||||
std::array<ck::index_t, 2> in_left_pad{1, 1};
|
||||
std::array<ck::index_t, 2> in_right_pad{1, 1};
|
||||
std::array<ck::index_t, 2> conv_strides{2, 2};
|
||||
@@ -130,10 +130,10 @@ int main(int argc, char* argv[])
|
||||
{
|
||||
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
|
||||
|
||||
std::size_t flop = G * 2 * N * K * C * Ho * Wo * Y * X;
|
||||
std::size_t num_bytes = G * sizeof(InDataType) * N * Hi * Wi * C +
|
||||
G * sizeof(WeiDataType) * K * Y * X * C +
|
||||
G * sizeof(OutDataType) * N * Ho * Wo * K;
|
||||
std::size_t flop = G * 2 * N * K * C * Ho * Wo * Y * X;
|
||||
std::size_t num_bytes =
|
||||
G * sizeof(InDataType) * N * Hi * Wi * C + G * sizeof(WeiDataType) * K * Y * X * C +
|
||||
G * sizeof(BiasDataType) * K + G * sizeof(OutDataType) * N * Ho * Wo * K;
|
||||
|
||||
float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
|
||||
float gb_per_sec = num_bytes / 1.E6 / avg_time;
|
||||
@@ -156,11 +156,12 @@ int main(int argc, char* argv[])
|
||||
}
|
||||
}
|
||||
|
||||
std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
|
||||
<< " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
|
||||
|
||||
// run the best intance
|
||||
if(best_op_id != -1)
|
||||
{
|
||||
std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
|
||||
<< " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
|
||||
|
||||
auto& op_ptr = op_ptrs[best_op_id];
|
||||
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
|
||||
<< std::endl;
|
||||
|
||||
@@ -26,15 +26,15 @@ using OutElementOp = ck::tensor_operation::element_wise::Activation_Mul2_C
|
||||
|
||||
static constexpr ck::index_t NumDimSpatial = 2;
|
||||
static constexpr ck::index_t G = 1;
|
||||
static constexpr ck::index_t N = 4;
|
||||
static constexpr ck::index_t K = 64;
|
||||
static constexpr ck::index_t C = 32;
|
||||
static constexpr ck::index_t Y = 3;
|
||||
static constexpr ck::index_t X = 3;
|
||||
static constexpr ck::index_t Hi = 71;
|
||||
static constexpr ck::index_t Wi = 71;
|
||||
static constexpr ck::index_t Ho = 36;
|
||||
static constexpr ck::index_t Wo = 36;
|
||||
static constexpr ck::index_t N = 4; // batch size
|
||||
static constexpr ck::index_t K = 64; // output channel
|
||||
static constexpr ck::index_t C = 192; // input channel
|
||||
static constexpr ck::index_t Y = 3; // filter H
|
||||
static constexpr ck::index_t X = 3; // filter W
|
||||
static constexpr ck::index_t Hi = 71; // input H
|
||||
static constexpr ck::index_t Wi = 71; // input W
|
||||
static constexpr ck::index_t Ho = 36; // output H
|
||||
static constexpr ck::index_t Wo = 36; // output W
|
||||
|
||||
struct SimpleDeviceMem
|
||||
{
|
||||
@@ -60,8 +60,8 @@ int main(int argc, char* argv[])
|
||||
std::array<ck::index_t, 5> weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C};
|
||||
std::array<ck::index_t, 5> requant_scale_lengths{G, N, K, Ho, Wo};
|
||||
std::array<ck::index_t, 5> requant_scale_strides{K, 0, 1, 0, 0};
|
||||
std::array<ck::index_t, 5> out_lengths{G, N, C, Ho, Wo};
|
||||
std::array<ck::index_t, 5> out_strides{N * Ho * Wo * C, Ho * Wo * C, 1, Wo * C, C};
|
||||
std::array<ck::index_t, 5> out_lengths{G, N, K, Ho, Wo};
|
||||
std::array<ck::index_t, 5> out_strides{N * Ho * Wo * K, Ho * Wo * K, 1, Wo * K, K};
|
||||
std::array<ck::index_t, 2> in_left_pad{1, 1};
|
||||
std::array<ck::index_t, 2> in_right_pad{1, 1};
|
||||
std::array<ck::index_t, 2> conv_strides{2, 2};
|
||||
@@ -130,10 +130,10 @@ int main(int argc, char* argv[])
|
||||
{
|
||||
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
|
||||
|
||||
std::size_t flop = G * 2 * N * K * C * Ho * Wo * Y * X;
|
||||
std::size_t num_bytes = G * sizeof(InDataType) * N * Hi * Wi * C +
|
||||
G * sizeof(WeiDataType) * K * Y * X * C +
|
||||
G * sizeof(OutDataType) * N * Ho * Wo * K;
|
||||
std::size_t flop = G * 2 * N * K * C * Ho * Wo * Y * X;
|
||||
std::size_t num_bytes =
|
||||
G * sizeof(InDataType) * N * Hi * Wi * C + G * sizeof(WeiDataType) * K * Y * X * C +
|
||||
G * sizeof(RequantScaleDataType) * K + G * sizeof(OutDataType) * N * Ho * Wo * K;
|
||||
|
||||
float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
|
||||
float gb_per_sec = num_bytes / 1.E6 / avg_time;
|
||||
@@ -156,11 +156,12 @@ int main(int argc, char* argv[])
|
||||
}
|
||||
}
|
||||
|
||||
std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
|
||||
<< " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
|
||||
|
||||
// run the best intance
|
||||
if(best_op_id != -1)
|
||||
{
|
||||
std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
|
||||
<< " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
|
||||
|
||||
auto& op_ptr = op_ptrs[best_op_id];
|
||||
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
|
||||
<< std::endl;
|
||||
|
||||
@@ -24,15 +24,15 @@ using OutElementOp = ck::tensor_operation::element_wise::Activation_Mul_Clamp<Ac
|
||||
|
||||
static constexpr ck::index_t NumDimSpatial = 2;
|
||||
static constexpr ck::index_t G = 1;
|
||||
static constexpr ck::index_t N = 4;
|
||||
static constexpr ck::index_t K = 64;
|
||||
static constexpr ck::index_t C = 32;
|
||||
static constexpr ck::index_t Y = 3;
|
||||
static constexpr ck::index_t X = 3;
|
||||
static constexpr ck::index_t Hi = 71;
|
||||
static constexpr ck::index_t Wi = 71;
|
||||
static constexpr ck::index_t Ho = 36;
|
||||
static constexpr ck::index_t Wo = 36;
|
||||
static constexpr ck::index_t N = 4; // batch size
|
||||
static constexpr ck::index_t K = 64; // output channel
|
||||
static constexpr ck::index_t C = 192; // input channel
|
||||
static constexpr ck::index_t Y = 3; // filter H
|
||||
static constexpr ck::index_t X = 3; // filter W
|
||||
static constexpr ck::index_t Hi = 71; // input H
|
||||
static constexpr ck::index_t Wi = 71; // input W
|
||||
static constexpr ck::index_t Ho = 36; // output H
|
||||
static constexpr ck::index_t Wo = 36; // output W
|
||||
|
||||
struct SimpleDeviceMem
|
||||
{
|
||||
@@ -56,8 +56,8 @@ int main(int argc, char* argv[])
|
||||
std::array<ck::index_t, 5> in_strides{N * Hi * Wi * C, Hi * Wi * C, 1, Wi * C, C};
|
||||
std::array<ck::index_t, 5> weight_lengths{G, K, C, Y, X};
|
||||
std::array<ck::index_t, 5> weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C};
|
||||
std::array<ck::index_t, 5> out_lengths{G, N, C, Ho, Wo};
|
||||
std::array<ck::index_t, 5> out_strides{N * Ho * Wo * C, Ho * Wo * C, 1, Wo * C, C};
|
||||
std::array<ck::index_t, 5> out_lengths{G, N, K, Ho, Wo};
|
||||
std::array<ck::index_t, 5> out_strides{N * Ho * Wo * K, Ho * Wo * K, 1, Wo * K, K};
|
||||
std::array<ck::index_t, 2> in_left_pad{1, 1};
|
||||
std::array<ck::index_t, 2> in_right_pad{1, 1};
|
||||
std::array<ck::index_t, 2> conv_strides{2, 2};
|
||||
@@ -150,11 +150,11 @@ int main(int argc, char* argv[])
|
||||
}
|
||||
}
|
||||
|
||||
std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
|
||||
<< " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
|
||||
|
||||
// run the best intance
|
||||
if(best_op_id != -1)
|
||||
{
|
||||
std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
|
||||
<< " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
|
||||
|
||||
auto& op_ptr = op_ptrs[best_op_id];
|
||||
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
|
||||
<< std::endl;
|
||||
|
||||
193
client_example/09_quantization/gemm_quantization.cpp
Normal file
193
client_example/09_quantization/gemm_quantization.cpp
Normal file
@@ -0,0 +1,193 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iomanip>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
#include "ck/library/tensor_operation_instance/gpu/quantization/gemm_quantization.hpp"
|
||||
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Col = ck::tensor_layout::gemm::ColumnMajor;
|
||||
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
using AElementOp = PassThrough;
|
||||
using BElementOp = PassThrough;
|
||||
using ActivationOp = PassThrough;
|
||||
using CDEElementOp = ck::tensor_operation::element_wise::Activation_Mul_Clamp<ActivationOp>;
|
||||
|
||||
using ADataType = int8_t;
|
||||
using BDataType = int8_t;
|
||||
using EDataType = int8_t;
|
||||
|
||||
using ALayout = Row;
|
||||
using BLayout = Col;
|
||||
using ELayout = Row;
|
||||
|
||||
struct SimpleDeviceMem
|
||||
{
|
||||
SimpleDeviceMem() = delete;
|
||||
|
||||
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
|
||||
{
|
||||
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
|
||||
}
|
||||
|
||||
void* GetDeviceBuffer() { return p_mem_; }
|
||||
|
||||
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
|
||||
|
||||
void* p_mem_;
|
||||
};
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
ck::index_t M = 1024;
|
||||
ck::index_t N = 1024;
|
||||
ck::index_t K = 1024;
|
||||
|
||||
ck::index_t StrideA = 1024;
|
||||
ck::index_t StrideB = 1024;
|
||||
ck::index_t StrideE = 1024;
|
||||
|
||||
float requant_scale = 0.03;
|
||||
|
||||
auto f_matrix_space_size =
|
||||
[](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) {
|
||||
using Layout = decltype(layout);
|
||||
|
||||
if constexpr(std::is_same<Layout, ck::tensor_layout::gemm::RowMajor>::value)
|
||||
{
|
||||
return (nRow - 1) * stride + nCol;
|
||||
}
|
||||
else
|
||||
{
|
||||
return (nCol - 1) * stride + nRow;
|
||||
}
|
||||
};
|
||||
|
||||
SimpleDeviceMem a_device_buf(sizeof(ADataType) * f_matrix_space_size(M, K, StrideA, ALayout{}));
|
||||
SimpleDeviceMem b_device_buf(sizeof(BDataType) * f_matrix_space_size(K, N, StrideB, BLayout{}));
|
||||
SimpleDeviceMem e_device_buf(sizeof(EDataType) * f_matrix_space_size(M, N, StrideE, ELayout{}));
|
||||
|
||||
using DeviceOp = ck::tensor_operation::device::DeviceGemmMultipleD<ALayout,
|
||||
BLayout,
|
||||
ck::Tuple<>,
|
||||
ELayout,
|
||||
ADataType,
|
||||
BDataType,
|
||||
ck::Tuple<>,
|
||||
EDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp>;
|
||||
|
||||
// get device op instances
|
||||
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
|
||||
DeviceOp>::GetInstances();
|
||||
|
||||
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
|
||||
|
||||
const auto a_element_op = AElementOp{};
|
||||
const auto b_element_op = BElementOp{};
|
||||
const auto cde_element_op = CDEElementOp{requant_scale, ActivationOp{}};
|
||||
|
||||
std::string best_op_name;
|
||||
int best_op_id = -1;
|
||||
float best_avg_time = std::numeric_limits<float>::max();
|
||||
float best_gb_per_sec = 0;
|
||||
float best_tflops = 0;
|
||||
|
||||
// profile device operation instances
|
||||
std::cout << "Run all instances and do timing" << std::endl;
|
||||
|
||||
for(int i = 0; i < op_ptrs.size(); ++i)
|
||||
{
|
||||
auto& op_ptr = op_ptrs[i];
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
|
||||
b_device_buf.GetDeviceBuffer(),
|
||||
{},
|
||||
e_device_buf.GetDeviceBuffer(),
|
||||
M,
|
||||
N,
|
||||
K,
|
||||
StrideA,
|
||||
StrideB,
|
||||
{},
|
||||
StrideE,
|
||||
a_element_op,
|
||||
b_element_op,
|
||||
cde_element_op);
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
std::string op_name = op_ptr->GetTypeString();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
|
||||
|
||||
std::size_t flop = std::size_t(2) * M * N * K;
|
||||
std::size_t num_bytes =
|
||||
sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(EDataType) * M * N;
|
||||
|
||||
float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
|
||||
float gb_per_sec = num_bytes / 1.E6 / avg_time;
|
||||
|
||||
std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, "
|
||||
<< gb_per_sec << " GB/s, " << op_name << std::endl;
|
||||
|
||||
if(tflops > best_tflops)
|
||||
{
|
||||
best_op_id = i;
|
||||
best_op_name = op_name;
|
||||
best_avg_time = avg_time;
|
||||
best_gb_per_sec = gb_per_sec;
|
||||
best_tflops = tflops;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cout << op_name << " does not support this problem" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
if(best_op_id != -1)
|
||||
{
|
||||
std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
|
||||
<< " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
|
||||
|
||||
auto& op_ptr = op_ptrs[best_op_id];
|
||||
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
|
||||
<< std::endl;
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
|
||||
b_device_buf.GetDeviceBuffer(),
|
||||
{},
|
||||
e_device_buf.GetDeviceBuffer(),
|
||||
M,
|
||||
N,
|
||||
K,
|
||||
StrideA,
|
||||
StrideB,
|
||||
{},
|
||||
StrideE,
|
||||
a_element_op,
|
||||
b_element_op,
|
||||
cde_element_op);
|
||||
|
||||
auto invoker_ptr = op_ptr->MakeInvokerPointer();
|
||||
|
||||
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
|
||||
{
|
||||
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
|
||||
}
|
||||
|
||||
std::cout << "Done" << std::endl;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -1,2 +1,6 @@
|
||||
# dlops
|
||||
add_example_executable(example_gemm_dl_quantization_int8 gemm_dl_quantization_int8.cpp)
|
||||
|
||||
# xdlops
|
||||
add_example_executable(example_gemm_xdl_bias_relu_quantization_int8 gemm_xdl_bias_relu_quantization_int8.cpp)
|
||||
add_example_executable(example_gemm_xdl_quantization_int8 gemm_xdl_quantization_int8.cpp)
|
||||
204
example/14_gemm_quantization/gemm_dl_quantization_int8.cpp
Normal file
204
example/14_gemm_quantization/gemm_dl_quantization_int8.cpp
Normal file
@@ -0,0 +1,204 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <iostream>
|
||||
#include <numeric>
|
||||
#include <initializer_list>
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_dl.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
#include "ck/library/utility/device_memory.hpp"
|
||||
#include "ck/library/utility/host_tensor.hpp"
|
||||
#include "ck/library/utility/host_tensor_generator.hpp"
|
||||
#include "ck/library/utility/literals.hpp"
|
||||
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
|
||||
#include "ck/library/utility/check_err.hpp"
|
||||
|
||||
template <ck::index_t... Is>
|
||||
using S = ck::Sequence<Is...>;
|
||||
|
||||
using I8 = int8_t;
|
||||
using I32 = int32_t;
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Col = ck::tensor_layout::gemm::ColumnMajor;
|
||||
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
using AElementOp = PassThrough;
|
||||
using BElementOp = PassThrough;
|
||||
using ActivationOp = PassThrough;
|
||||
using CDEElementOp = ck::tensor_operation::element_wise::Activation_Mul_Clamp<ActivationOp>;
|
||||
|
||||
using ADataType = I8;
|
||||
using BDataType = I8;
|
||||
using AccDataType = I32;
|
||||
using CShuffleDataType = I32;
|
||||
using DsDataType = ck::Tuple<>;
|
||||
using EDataType = I8;
|
||||
|
||||
using ALayout = Row;
|
||||
using BLayout = Col;
|
||||
using DsLayout = ck::Tuple<>;
|
||||
using ELayout = Row;
|
||||
|
||||
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
|
||||
|
||||
using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Dl<
|
||||
ALayout,
|
||||
BLayout,
|
||||
DsLayout,
|
||||
ELayout,
|
||||
ADataType,
|
||||
BDataType,
|
||||
AccDataType,
|
||||
DsDataType,
|
||||
EDataType,
|
||||
AElementOp,
|
||||
BElementOp,
|
||||
CDEElementOp,
|
||||
GemmDefault,
|
||||
256, // BlockSize
|
||||
128, // MPerBlock
|
||||
128, // NPerBlock
|
||||
16, // K0PerBlock
|
||||
4, // K1
|
||||
4, // M1PerThread
|
||||
4, // N1PerThread
|
||||
1, // KPerThread
|
||||
S<8, 2>, // M1N1ThreadClusterM1Xs
|
||||
S<8, 2>, // M1N1ThreadClusterN1Xs
|
||||
S<8, 1, 1, 4>, // ABlockTransferThreadSliceLengths_K0_M0_M1_K1
|
||||
S<2, 1, 128, 1>, // ABlockTransferThreadClusterLengths_K0_M0_M1_K1
|
||||
S<1, 2, 0, 3>, // ABlockTransferThreadClusterArrangeOrder
|
||||
S<1, 2, 0, 3>, // ABlockTransferSrcAccessOrder
|
||||
S<4, 1, 1, 4>, // ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1
|
||||
S<1, 2, 0, 3>, // ABlockTransferSrcVectorTensorContiguousDimOrder
|
||||
S<1, 1, 1, 4>, // ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1
|
||||
S<8, 1, 1, 4>, // BBlockTransferThreadSliceLengths_K0_N0_N1_K1
|
||||
S<2, 1, 128, 1>, // BBlockTransferThreadClusterLengths_K0_N0_N1_K1
|
||||
S<1, 2, 0, 3>, // BBlockTransferThreadClusterArrangeOrder
|
||||
S<1, 2, 0, 3>, // BBlockTransferSrcAccessOrder
|
||||
S<4, 1, 1, 4>, // BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1
|
||||
S<1, 2, 0, 3>, // BBlockTransferSrcVectorTensorContiguousDimOrder
|
||||
S<1, 1, 1, 4>, // BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1
|
||||
S<0, 1, 2, 3, 4, 5>, // CThreadTransferSrcDstAccessOrder
|
||||
5, // CThreadTransferSrcDstVectorDim
|
||||
4>; // CThreadTransferDstScalarPerVector
|
||||
|
||||
using ReferenceGemmInstance = ck::tensor_operation::host::
|
||||
ReferenceGemm<ADataType, BDataType, EDataType, float, PassThrough, PassThrough, CDEElementOp>;
|
||||
|
||||
int main()
|
||||
{
|
||||
bool do_verification = true;
|
||||
bool time_kernel = false;
|
||||
|
||||
// GEMM shape
|
||||
ck::index_t M = 1024;
|
||||
ck::index_t N = 1024;
|
||||
ck::index_t K = 1024;
|
||||
|
||||
ck::index_t StrideA = 1024;
|
||||
ck::index_t StrideB = 1024;
|
||||
ck::index_t StrideE = 1024;
|
||||
|
||||
float requant_scale = 0.03;
|
||||
|
||||
auto f_host_tensor_descriptor =
|
||||
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
|
||||
using namespace ck::literals;
|
||||
|
||||
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
|
||||
{
|
||||
return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
|
||||
std::vector<std::size_t>({stride, 1_uz}));
|
||||
}
|
||||
else
|
||||
{
|
||||
return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
|
||||
std::vector<std::size_t>({1_uz, stride}));
|
||||
}
|
||||
};
|
||||
|
||||
Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
|
||||
Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
|
||||
Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
|
||||
Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
|
||||
|
||||
std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
|
||||
std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
|
||||
std::cout << "e_m_n: " << e_m_n_host_result.mDesc << std::endl;
|
||||
|
||||
a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
|
||||
b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
|
||||
|
||||
DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
|
||||
DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
|
||||
DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize());
|
||||
|
||||
a_device_buf.ToDevice(a_m_k.mData.data());
|
||||
b_device_buf.ToDevice(b_k_n.mData.data());
|
||||
|
||||
auto a_element_op = AElementOp{};
|
||||
auto b_element_op = BElementOp{};
|
||||
auto cde_element_op = CDEElementOp{requant_scale, ActivationOp{}};
|
||||
|
||||
// do GEMM
|
||||
auto gemm = DeviceGemmInstance{};
|
||||
auto invoker = gemm.MakeInvoker();
|
||||
auto argument = gemm.MakeArgument(a_device_buf.GetDeviceBuffer(),
|
||||
b_device_buf.GetDeviceBuffer(),
|
||||
{},
|
||||
e_device_buf.GetDeviceBuffer(),
|
||||
M,
|
||||
N,
|
||||
K,
|
||||
StrideA,
|
||||
StrideB,
|
||||
{},
|
||||
StrideE,
|
||||
a_element_op,
|
||||
b_element_op,
|
||||
cde_element_op);
|
||||
|
||||
if(!gemm.IsSupportedArgument(argument))
|
||||
{
|
||||
throw std::runtime_error(
|
||||
"wrong! device_gemm with the specified compilation parameters does "
|
||||
"not support this GEMM problem");
|
||||
}
|
||||
|
||||
float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
|
||||
|
||||
std::size_t flop = std::size_t(2) * M * N * K;
|
||||
std::size_t num_btype =
|
||||
sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(EDataType) * M * N;
|
||||
|
||||
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
|
||||
|
||||
float gb_per_sec = num_btype / 1.E6 / ave_time;
|
||||
|
||||
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
|
||||
<< gemm.GetTypeString() << std::endl;
|
||||
|
||||
e_device_buf.FromDevice(e_m_n_device_result.mData.data());
|
||||
|
||||
if(do_verification)
|
||||
{
|
||||
auto ref_gemm = ReferenceGemmInstance{};
|
||||
auto ref_invoker = ref_gemm.MakeInvoker();
|
||||
|
||||
auto ref_argument = ref_gemm.MakeArgument(
|
||||
a_m_k, b_k_n, e_m_n_host_result, a_element_op, b_element_op, cde_element_op);
|
||||
|
||||
ref_invoker.Run(ref_argument);
|
||||
|
||||
return ck::utils::check_err(e_m_n_device_result, e_m_n_host_result) ? 0 : 1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
16
example/40_conv2d_fwd_quantization/CMakeLists.txt
Normal file
16
example/40_conv2d_fwd_quantization/CMakeLists.txt
Normal file
@@ -0,0 +1,16 @@
|
||||
# Conv perlayer quantization
|
||||
add_example_executable(example_conv2d_fwd_dl_perlayer_quantization_int8 conv2d_fwd_dl_perlayer_quantization_int8.cpp)
|
||||
add_example_executable(example_conv2d_fwd_xdl_perlayer_quantization_int8 conv2d_fwd_xdl_perlayer_quantization_int8.cpp)
|
||||
|
||||
# Conv perchannel quantization
|
||||
add_example_executable(example_conv2d_fwd_dl_perchannel_quantization_int8 conv2d_fwd_dl_perchannel_quantization_int8.cpp)
|
||||
add_example_executable(example_conv2d_fwd_xdl_perchannel_quantization_int8 conv2d_fwd_xdl_perchannel_quantization_int8.cpp)
|
||||
|
||||
# Conv + bias + relu perlayer quantization
|
||||
add_example_executable(example_conv2d_fwd_dl_bias_relu_perlayer_quantization_int8 conv2d_fwd_dl_bias_relu_perlayer_quantization_int8.cpp)
|
||||
add_example_executable(example_conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8 conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8.cpp)
|
||||
|
||||
# Conv + bias + relu perchannel quantization
|
||||
add_example_executable(example_conv2d_fwd_dl_bias_relu_perchannel_quantization_int8 conv2d_fwd_dl_bias_relu_perchannel_quantization_int8.cpp)
|
||||
add_example_executable(example_conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8 conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8.cpp)
|
||||
|
||||
18
example/40_conv2d_fwd_quantization/common.hpp
Normal file
18
example/40_conv2d_fwd_quantization/common.hpp
Normal file
@@ -0,0 +1,18 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
#include "ck/library/utility/algorithm.hpp"
|
||||
#include "ck/library/utility/check_err.hpp"
|
||||
#include "ck/library/utility/device_memory.hpp"
|
||||
#include "ck/library/utility/host_tensor.hpp"
|
||||
#include "ck/library/utility/host_tensor_generator.hpp"
|
||||
#include "ck/library/utility/literals.hpp"
|
||||
#include "ck/library/utility/convolution_parameter.hpp"
|
||||
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
|
||||
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
|
||||
@@ -0,0 +1,81 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "common.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp"
|
||||
|
||||
using InDataType = int8_t;
|
||||
using WeiDataType = int8_t;
|
||||
using BiasDataType = int32_t;
|
||||
using RequantScaleDataType = float;
|
||||
using AccDataType = int32_t;
|
||||
using OutDataType = int8_t;
|
||||
|
||||
template <ck::index_t... Is>
|
||||
using S = ck::Sequence<Is...>;
|
||||
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
using InElementOp = PassThrough;
|
||||
using WeiElementOp = PassThrough;
|
||||
using ActivationOp = ck::tensor_operation::element_wise::Relu;
|
||||
using OutElementOp = ck::tensor_operation::element_wise::Add_Activation_Mul2_Clamp<ActivationOp>;
|
||||
|
||||
static constexpr auto ConvSpec =
|
||||
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
|
||||
|
||||
static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
|
||||
|
||||
template <ck::index_t NDimSpatial,
|
||||
typename InLayout,
|
||||
typename WeiLayout,
|
||||
typename BiasLayout,
|
||||
typename RequantScaleLayout,
|
||||
typename OutLayout>
|
||||
using DeviceGroupedConvNDFwdInstance =
|
||||
ck::tensor_operation::device::DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK<
|
||||
NDimSpatial,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
ck::Tuple<BiasDataType, RequantScaleDataType>,
|
||||
OutDataType,
|
||||
AccDataType,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
ck::Tuple<BiasLayout, RequantScaleLayout>,
|
||||
OutLayout,
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
OutElementOp,
|
||||
ConvSpec, // ConvForwardSpecialization
|
||||
GemmSpec, // GemmSpecialization
|
||||
256, // BlockSize
|
||||
128, // MPerBlock
|
||||
128, // NPerBlock
|
||||
16, // K0PerBlock
|
||||
4, // K1
|
||||
4, // M1PerThread
|
||||
4, // N1PerThread
|
||||
1, // KPerThread
|
||||
S<8, 2>, // M1N1ThreadClusterM1Xs
|
||||
S<8, 2>, // M1N1ThreadClusterN1Xs
|
||||
S<8, 1, 1, 4>, // ABlockTransferThreadSliceLengths_K0_M0_M1_K1
|
||||
S<2, 1, 128, 1>, // ABlockTransferThreadClusterLengths_K0_M0_M1_K1
|
||||
S<1, 2, 0, 3>, // ABlockTransferThreadClusterArrangeOrder
|
||||
S<1, 2, 0, 3>, // ABlockTransferSrcAccessOrder
|
||||
S<4, 1, 1, 4>, // ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1
|
||||
S<1, 2, 0, 3>, // ABlockTransferSrcVectorTensorContiguousDimOrder
|
||||
S<1, 1, 1, 4>, // ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1
|
||||
S<8, 1, 1, 4>, // BBlockTransferThreadSliceLengths_K0_N0_N1_K1
|
||||
S<2, 1, 128, 1>, // BBlockTransferThreadClusterLengths_K0_N0_N1_K1
|
||||
S<1, 2, 0, 3>, // BBlockTransferThreadClusterArrangeOrder
|
||||
S<1, 2, 0, 3>, // BBlockTransferSrcAccessOrder
|
||||
S<4, 1, 1, 4>, // BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1
|
||||
S<1, 2, 0, 3>, // BBlockTransferSrcVectorTensorContiguousDimOrder
|
||||
S<1, 1, 1, 4>, // BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1
|
||||
S<0, 1, 2, 3, 4, 5>, // CThreadTransferSrcDstAccessOrder
|
||||
5, // CThreadTransferSrcDstVectorDim
|
||||
4>; // CThreadTransferDstScalarPerVector
|
||||
|
||||
#include "run_conv2d_fwd_bias_relu_perchannel_quantization_example.inc"
|
||||
|
||||
int main() { run_conv2d_fwd_bias_relu_perchannel_quantization_example(); };
|
||||
@@ -0,0 +1,79 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "common.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp"
|
||||
|
||||
using InDataType = int8_t;
|
||||
using WeiDataType = int8_t;
|
||||
using BiasDataType = int32_t;
|
||||
using AccDataType = int32_t;
|
||||
using OutDataType = int8_t;
|
||||
|
||||
template <ck::index_t... Is>
|
||||
using S = ck::Sequence<Is...>;
|
||||
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
using InElementOp = PassThrough;
|
||||
using WeiElementOp = PassThrough;
|
||||
using ActivationOp = ck::tensor_operation::element_wise::Relu;
|
||||
using OutElementOp = ck::tensor_operation::element_wise::Add_Activation_Mul_Clamp<ActivationOp>;
|
||||
|
||||
static constexpr auto ConvSpec =
|
||||
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
|
||||
|
||||
static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
|
||||
|
||||
template <ck::index_t NDimSpatial,
|
||||
typename InLayout,
|
||||
typename WeiLayout,
|
||||
typename BiasLayout,
|
||||
typename OutLayout>
|
||||
using DeviceGroupedConvNDFwdInstance =
|
||||
ck::tensor_operation::device::DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK<
|
||||
NDimSpatial,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
ck::Tuple<BiasDataType>,
|
||||
OutDataType,
|
||||
AccDataType,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
ck::Tuple<BiasLayout>,
|
||||
OutLayout,
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
OutElementOp,
|
||||
ConvSpec, // ConvForwardSpecialization
|
||||
GemmSpec, // GemmSpecialization
|
||||
256, // BlockSize
|
||||
128, // MPerBlock
|
||||
128, // NPerBlock
|
||||
16, // K0PerBlock
|
||||
4, // K1
|
||||
4, // M1PerThread
|
||||
4, // N1PerThread
|
||||
1, // KPerThread
|
||||
S<8, 2>, // M1N1ThreadClusterM1Xs
|
||||
S<8, 2>, // M1N1ThreadClusterN1Xs
|
||||
S<8, 1, 1, 4>, // ABlockTransferThreadSliceLengths_K0_M0_M1_K1
|
||||
S<2, 1, 128, 1>, // ABlockTransferThreadClusterLengths_K0_M0_M1_K1
|
||||
S<1, 2, 0, 3>, // ABlockTransferThreadClusterArrangeOrder
|
||||
S<1, 2, 0, 3>, // ABlockTransferSrcAccessOrder
|
||||
S<4, 1, 1, 4>, // ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1
|
||||
S<1, 2, 0, 3>, // ABlockTransferSrcVectorTensorContiguousDimOrder
|
||||
S<1, 1, 1, 4>, // ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1
|
||||
S<8, 1, 1, 4>, // BBlockTransferThreadSliceLengths_K0_N0_N1_K1
|
||||
S<2, 1, 128, 1>, // BBlockTransferThreadClusterLengths_K0_N0_N1_K1
|
||||
S<1, 2, 0, 3>, // BBlockTransferThreadClusterArrangeOrder
|
||||
S<1, 2, 0, 3>, // BBlockTransferSrcAccessOrder
|
||||
S<4, 1, 1, 4>, // BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1
|
||||
S<1, 2, 0, 3>, // BBlockTransferSrcVectorTensorContiguousDimOrder
|
||||
S<1, 1, 1, 4>, // BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1
|
||||
S<0, 1, 2, 3, 4, 5>, // CThreadTransferSrcDstAccessOrder
|
||||
5, // CThreadTransferSrcDstVectorDim
|
||||
4>; // CThreadTransferDstScalarPerVector
|
||||
|
||||
#include "run_conv2d_fwd_bias_relu_perlayer_quantization_example.inc"
|
||||
|
||||
int main() { run_conv2d_fwd_bias_relu_perlayer_quantization_example(); }
|
||||
@@ -0,0 +1,79 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "common.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp"
|
||||
|
||||
using InDataType = int8_t;
|
||||
using WeiDataType = int8_t;
|
||||
using RequantScaleDataType = float;
|
||||
using AccDataType = int32_t;
|
||||
using OutDataType = int8_t;
|
||||
|
||||
template <ck::index_t... Is>
|
||||
using S = ck::Sequence<Is...>;
|
||||
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
using InElementOp = PassThrough;
|
||||
using WeiElementOp = PassThrough;
|
||||
using ActivationOp = ck::tensor_operation::element_wise::Relu;
|
||||
using OutElementOp = ck::tensor_operation::element_wise::Activation_Mul2_Clamp<ActivationOp>;
|
||||
|
||||
static constexpr auto ConvSpec =
|
||||
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
|
||||
|
||||
static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
|
||||
|
||||
template <ck::index_t NDimSpatial,
|
||||
typename InLayout,
|
||||
typename WeiLayout,
|
||||
typename RequantScaleLayout,
|
||||
typename OutLayout>
|
||||
using DeviceGroupedConvNDFwdInstance =
|
||||
ck::tensor_operation::device::DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK<
|
||||
NDimSpatial,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
ck::Tuple<RequantScaleDataType>,
|
||||
OutDataType,
|
||||
AccDataType,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
ck::Tuple<RequantScaleLayout>,
|
||||
OutLayout,
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
OutElementOp,
|
||||
ConvSpec, // ConvForwardSpecialization
|
||||
GemmSpec, // GemmSpecialization
|
||||
256, // BlockSize
|
||||
128, // MPerBlock
|
||||
128, // NPerBlock
|
||||
16, // K0PerBlock
|
||||
4, // K1
|
||||
4, // M1PerThread
|
||||
4, // N1PerThread
|
||||
1, // KPerThread
|
||||
S<8, 2>, // M1N1ThreadClusterM1Xs
|
||||
S<8, 2>, // M1N1ThreadClusterN1Xs
|
||||
S<8, 1, 1, 4>, // ABlockTransferThreadSliceLengths_K0_M0_M1_K1
|
||||
S<2, 1, 128, 1>, // ABlockTransferThreadClusterLengths_K0_M0_M1_K1
|
||||
S<1, 2, 0, 3>, // ABlockTransferThreadClusterArrangeOrder
|
||||
S<1, 2, 0, 3>, // ABlockTransferSrcAccessOrder
|
||||
S<4, 1, 1, 4>, // ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1
|
||||
S<1, 2, 0, 3>, // ABlockTransferSrcVectorTensorContiguousDimOrder
|
||||
S<1, 1, 1, 4>, // ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1
|
||||
S<8, 1, 1, 4>, // BBlockTransferThreadSliceLengths_K0_N0_N1_K1
|
||||
S<2, 1, 128, 1>, // BBlockTransferThreadClusterLengths_K0_N0_N1_K1
|
||||
S<1, 2, 0, 3>, // BBlockTransferThreadClusterArrangeOrder
|
||||
S<1, 2, 0, 3>, // BBlockTransferSrcAccessOrder
|
||||
S<4, 1, 1, 4>, // BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1
|
||||
S<1, 2, 0, 3>, // BBlockTransferSrcVectorTensorContiguousDimOrder
|
||||
S<1, 1, 1, 4>, // BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1
|
||||
S<0, 1, 2, 3, 4, 5>, // CThreadTransferSrcDstAccessOrder
|
||||
5, // CThreadTransferSrcDstVectorDim
|
||||
4>; // CThreadTransferDstScalarPerVector
|
||||
|
||||
#include "run_conv2d_fwd_perchannel_quantization_example.inc"
|
||||
|
||||
int main() { run_conv2d_fwd_perchannel_quantization_example(); }
|
||||
@@ -0,0 +1,74 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "common.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp"
|
||||
|
||||
using InDataType = int8_t;
|
||||
using WeiDataType = int8_t;
|
||||
using AccDataType = int32_t;
|
||||
using OutDataType = int8_t;
|
||||
|
||||
template <ck::index_t... Is>
|
||||
using S = ck::Sequence<Is...>;
|
||||
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
using InElementOp = PassThrough;
|
||||
using WeiElementOp = PassThrough;
|
||||
using ActivationOp = PassThrough;
|
||||
using OutElementOp = ck::tensor_operation::element_wise::Activation_Mul_Clamp<ActivationOp>;
|
||||
|
||||
static constexpr auto ConvSpec =
|
||||
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
|
||||
|
||||
static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
|
||||
|
||||
template <ck::index_t NDimSpatial, typename InLayout, typename WeiLayout, typename OutLayout>
|
||||
using DeviceGroupedConvNDFwdInstance =
|
||||
ck::tensor_operation::device::DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK<
|
||||
NDimSpatial,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
ck::Tuple<>,
|
||||
OutDataType,
|
||||
AccDataType,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
ck::Tuple<>,
|
||||
OutLayout,
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
OutElementOp,
|
||||
ConvSpec, // ConvForwardSpecialization
|
||||
GemmSpec, // GemmSpecialization
|
||||
256, // BlockSize
|
||||
128, // MPerBlock
|
||||
128, // NPerBlock
|
||||
16, // K0PerBlock
|
||||
4, // K1
|
||||
4, // M1PerThread
|
||||
4, // N1PerThread
|
||||
1, // KPerThread
|
||||
S<8, 2>, // M1N1ThreadClusterM1Xs
|
||||
S<8, 2>, // M1N1ThreadClusterN1Xs
|
||||
S<8, 1, 1, 4>, // ABlockTransferThreadSliceLengths_K0_M0_M1_K1
|
||||
S<2, 1, 128, 1>, // ABlockTransferThreadClusterLengths_K0_M0_M1_K1
|
||||
S<1, 2, 0, 3>, // ABlockTransferThreadClusterArrangeOrder
|
||||
S<1, 2, 0, 3>, // ABlockTransferSrcAccessOrder
|
||||
S<4, 1, 1, 4>, // ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1
|
||||
S<1, 2, 0, 3>, // ABlockTransferSrcVectorTensorContiguousDimOrder
|
||||
S<1, 1, 1, 4>, // ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1
|
||||
S<8, 1, 1, 4>, // BBlockTransferThreadSliceLengths_K0_N0_N1_K1
|
||||
S<2, 1, 128, 1>, // BBlockTransferThreadClusterLengths_K0_N0_N1_K1
|
||||
S<1, 2, 0, 3>, // BBlockTransferThreadClusterArrangeOrder
|
||||
S<1, 2, 0, 3>, // BBlockTransferSrcAccessOrder
|
||||
S<4, 1, 1, 4>, // BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1
|
||||
S<1, 2, 0, 3>, // BBlockTransferSrcVectorTensorContiguousDimOrder
|
||||
S<1, 1, 1, 4>, // BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1
|
||||
S<0, 1, 2, 3, 4, 5>, // CThreadTransferSrcDstAccessOrder
|
||||
5, // CThreadTransferSrcDstVectorDim
|
||||
4>; // CThreadTransferDstScalarPerVector
|
||||
|
||||
#include "run_conv2d_fwd_perlayer_quantization_example.inc"
|
||||
|
||||
int main() { run_conv2d_fwd_perlayer_quantization_example(); }
|
||||
@@ -0,0 +1,85 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "common.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
|
||||
|
||||
using InDataType = int8_t;
|
||||
using WeiDataType = int8_t;
|
||||
using BiasDataType = int32_t;
|
||||
using RequantScaleDataType = float;
|
||||
using AccDataType = int32_t;
|
||||
using CShuffleDataType = AccDataType;
|
||||
using OutDataType = int8_t;
|
||||
|
||||
template <ck::index_t... Is>
|
||||
using S = ck::Sequence<Is...>;
|
||||
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
using InElementOp = PassThrough;
|
||||
using WeiElementOp = PassThrough;
|
||||
using ActivationOp = ck::tensor_operation::element_wise::Relu;
|
||||
using OutElementOp = ck::tensor_operation::element_wise::Add_Activation_Mul2_Clamp<ActivationOp>;
|
||||
|
||||
static constexpr auto ConvSpec =
|
||||
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
|
||||
|
||||
static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
|
||||
|
||||
template <ck::index_t NDimSpatial,
|
||||
typename InLayout,
|
||||
typename WeiLayout,
|
||||
typename BiasLayout,
|
||||
typename RequantScaleLayout,
|
||||
typename OutLayout>
|
||||
using DeviceGroupedConvNDFwdInstance =
|
||||
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<
|
||||
NDimSpatial,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
ck::Tuple<BiasLayout, RequantScaleLayout>,
|
||||
OutLayout,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
ck::Tuple<BiasDataType, RequantScaleDataType>,
|
||||
OutDataType,
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
OutElementOp,
|
||||
ConvSpec, // ConvForwardSpecialization
|
||||
GemmSpec, // GemmSpecialization
|
||||
1, //
|
||||
256, // BlockSize
|
||||
128, // MPerBlock
|
||||
256, // NPerBlock
|
||||
64, // KPerBlock
|
||||
16, // AK1
|
||||
16, // BK1
|
||||
32, // MPerXdl
|
||||
32, // NPerXdl
|
||||
2, // MXdlPerWave
|
||||
4, // NXdlPerWave
|
||||
S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
|
||||
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
|
||||
S<1, 0, 2>, // ABlockTransferSrcAccessOrder
|
||||
2, // ABlockTransferSrcVectorDim
|
||||
16, // ABlockTransferSrcScalarPerVector
|
||||
16, // ABlockTransferDstScalarPerVector_AK1
|
||||
1, // ABlockLdsExtraM
|
||||
S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
|
||||
S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder
|
||||
S<1, 0, 2>, // BBlockTransferSrcAccessOrder
|
||||
2, // BBlockTransferSrcVectorDim
|
||||
16, // BBlockTransferSrcScalarPerVector
|
||||
16, // BBlockTransferDstScalarPerVector_BK1
|
||||
1, // BBlockLdsExtraN
|
||||
1,
|
||||
1,
|
||||
S<1, 64, 1, 4>,
|
||||
8>;
|
||||
|
||||
#include "run_conv2d_fwd_bias_relu_perchannel_quantization_example.inc"
|
||||
|
||||
int main() { run_conv2d_fwd_bias_relu_perchannel_quantization_example(); };
|
||||
@@ -0,0 +1,83 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "common.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
|
||||
|
||||
using InDataType = int8_t;
|
||||
using WeiDataType = int8_t;
|
||||
using BiasDataType = int32_t;
|
||||
using AccDataType = int32_t;
|
||||
using CShuffleDataType = AccDataType;
|
||||
using OutDataType = int8_t;
|
||||
|
||||
template <ck::index_t... Is>
|
||||
using S = ck::Sequence<Is...>;
|
||||
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
using InElementOp = PassThrough;
|
||||
using WeiElementOp = PassThrough;
|
||||
using ActivationOp = ck::tensor_operation::element_wise::Relu;
|
||||
using OutElementOp = ck::tensor_operation::element_wise::Add_Activation_Mul_Clamp<ActivationOp>;
|
||||
|
||||
static constexpr auto ConvSpec =
|
||||
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
|
||||
|
||||
static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
|
||||
|
||||
template <ck::index_t NDimSpatial,
|
||||
typename InLayout,
|
||||
typename WeiLayout,
|
||||
typename BiasLayout,
|
||||
typename OutLayout>
|
||||
using DeviceGroupedConvNDFwdInstance =
|
||||
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<
|
||||
NDimSpatial,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
ck::Tuple<BiasLayout>,
|
||||
OutLayout,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
ck::Tuple<BiasDataType>,
|
||||
OutDataType,
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
OutElementOp,
|
||||
ConvSpec, // ConvForwardSpecialization
|
||||
GemmSpec, // GemmSpecialization
|
||||
1, //
|
||||
256, // BlockSize
|
||||
128, // MPerBlock
|
||||
256, // NPerBlock
|
||||
64, // KPerBlock
|
||||
16, // AK1
|
||||
16, // BK1
|
||||
32, // MPerXdl
|
||||
32, // NPerXdl
|
||||
2, // MXdlPerWave
|
||||
4, // NXdlPerWave
|
||||
S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
|
||||
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
|
||||
S<1, 0, 2>, // ABlockTransferSrcAccessOrder
|
||||
2, // ABlockTransferSrcVectorDim
|
||||
16, // ABlockTransferSrcScalarPerVector
|
||||
16, // ABlockTransferDstScalarPerVector_AK1
|
||||
1, // ABlockLdsExtraM
|
||||
S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
|
||||
S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder
|
||||
S<1, 0, 2>, // BBlockTransferSrcAccessOrder
|
||||
2, // BBlockTransferSrcVectorDim
|
||||
16, // BBlockTransferSrcScalarPerVector
|
||||
16, // BBlockTransferDstScalarPerVector_BK1
|
||||
1, // BBlockLdsExtraN
|
||||
1,
|
||||
1,
|
||||
S<1, 64, 1, 4>,
|
||||
8>;
|
||||
|
||||
#include "run_conv2d_fwd_bias_relu_perlayer_quantization_example.inc"
|
||||
|
||||
int main() { run_conv2d_fwd_bias_relu_perlayer_quantization_example(); }
|
||||
@@ -0,0 +1,83 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "common.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
|
||||
|
||||
using InDataType = int8_t;
|
||||
using WeiDataType = int8_t;
|
||||
using RequantScaleDataType = float;
|
||||
using AccDataType = int32_t;
|
||||
using CShuffleDataType = AccDataType;
|
||||
using OutDataType = int8_t;
|
||||
|
||||
template <ck::index_t... Is>
|
||||
using S = ck::Sequence<Is...>;
|
||||
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
using InElementOp = PassThrough;
|
||||
using WeiElementOp = PassThrough;
|
||||
using ActivationOp = ck::tensor_operation::element_wise::Relu;
|
||||
using OutElementOp = ck::tensor_operation::element_wise::Activation_Mul2_Clamp<ActivationOp>;
|
||||
|
||||
static constexpr auto ConvSpec =
|
||||
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
|
||||
|
||||
static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
|
||||
|
||||
template <ck::index_t NDimSpatial,
|
||||
typename InLayout,
|
||||
typename WeiLayout,
|
||||
typename RequantScaleLayout,
|
||||
typename OutLayout>
|
||||
using DeviceGroupedConvNDFwdInstance =
|
||||
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<
|
||||
NDimSpatial,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
ck::Tuple<RequantScaleLayout>,
|
||||
OutLayout,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
ck::Tuple<RequantScaleDataType>,
|
||||
OutDataType,
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
OutElementOp,
|
||||
ConvSpec, // ConvForwardSpecialization
|
||||
GemmSpec, // GemmSpecialization
|
||||
1, //
|
||||
256, // BlockSize
|
||||
128, // MPerBlock
|
||||
256, // NPerBlock
|
||||
64, // KPerBlock
|
||||
16, // AK1
|
||||
16, // BK1
|
||||
32, // MPerXdl
|
||||
32, // NPerXdl
|
||||
2, // MXdlPerWave
|
||||
4, // NXdlPerWave
|
||||
S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
|
||||
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
|
||||
S<1, 0, 2>, // ABlockTransferSrcAccessOrder
|
||||
2, // ABlockTransferSrcVectorDim
|
||||
16, // ABlockTransferSrcScalarPerVector
|
||||
16, // ABlockTransferDstScalarPerVector_AK1
|
||||
1, // ABlockLdsExtraM
|
||||
S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
|
||||
S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder
|
||||
S<1, 0, 2>, // BBlockTransferSrcAccessOrder
|
||||
2, // BBlockTransferSrcVectorDim
|
||||
16, // BBlockTransferSrcScalarPerVector
|
||||
16, // BBlockTransferDstScalarPerVector_BK1
|
||||
1, // BBlockLdsExtraN
|
||||
1,
|
||||
1,
|
||||
S<1, 64, 1, 4>,
|
||||
8>;
|
||||
|
||||
#include "run_conv2d_fwd_perchannel_quantization_example.inc"
|
||||
|
||||
int main() { run_conv2d_fwd_perchannel_quantization_example(); }
|
||||
@@ -0,0 +1,78 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "common.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
|
||||
|
||||
using InDataType = int8_t;
|
||||
using WeiDataType = int8_t;
|
||||
using AccDataType = int32_t;
|
||||
using CShuffleDataType = AccDataType;
|
||||
using OutDataType = int8_t;
|
||||
|
||||
template <ck::index_t... Is>
|
||||
using S = ck::Sequence<Is...>;
|
||||
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
using InElementOp = PassThrough;
|
||||
using WeiElementOp = PassThrough;
|
||||
using ActivationOp = PassThrough;
|
||||
using OutElementOp = ck::tensor_operation::element_wise::Activation_Mul_Clamp<ActivationOp>;
|
||||
|
||||
static constexpr auto ConvSpec =
|
||||
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
|
||||
|
||||
static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
|
||||
|
||||
template <ck::index_t NDimSpatial, typename InLayout, typename WeiLayout, typename OutLayout>
|
||||
using DeviceGroupedConvNDFwdInstance =
|
||||
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<
|
||||
NDimSpatial,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
ck::Tuple<>,
|
||||
OutLayout,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
ck::Tuple<>,
|
||||
OutDataType,
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
OutElementOp,
|
||||
ConvSpec, // ConvForwardSpecialization
|
||||
GemmSpec, // GemmSpecialization
|
||||
1, //
|
||||
256, // BlockSize
|
||||
128, // MPerBlock
|
||||
256, // NPerBlock
|
||||
64, // KPerBlock
|
||||
16, // AK1
|
||||
16, // BK1
|
||||
32, // MPerXdl
|
||||
32, // NPerXdl
|
||||
2, // MXdlPerWave
|
||||
4, // NXdlPerWave
|
||||
S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
|
||||
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
|
||||
S<1, 0, 2>, // ABlockTransferSrcAccessOrder
|
||||
2, // ABlockTransferSrcVectorDim
|
||||
16, // ABlockTransferSrcScalarPerVector
|
||||
16, // ABlockTransferDstScalarPerVector_AK1
|
||||
1, // ABlockLdsExtraM
|
||||
S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
|
||||
S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder
|
||||
S<1, 0, 2>, // BBlockTransferSrcAccessOrder
|
||||
2, // BBlockTransferSrcVectorDim
|
||||
16, // BBlockTransferSrcScalarPerVector
|
||||
16, // BBlockTransferDstScalarPerVector_BK1
|
||||
1, // BBlockLdsExtraN
|
||||
1,
|
||||
1,
|
||||
S<1, 64, 1, 4>,
|
||||
16>;
|
||||
|
||||
#include "run_conv2d_fwd_perlayer_quantization_example.inc"
|
||||
|
||||
int main() { run_conv2d_fwd_perlayer_quantization_example(); }
|
||||
@@ -1,97 +1,7 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
#include "ck/library/utility/algorithm.hpp"
|
||||
#include "ck/library/utility/check_err.hpp"
|
||||
#include "ck/library/utility/device_memory.hpp"
|
||||
#include "ck/library/utility/host_tensor.hpp"
|
||||
#include "ck/library/utility/host_tensor_generator.hpp"
|
||||
#include "ck/library/utility/literals.hpp"
|
||||
#include "ck/library/utility/convolution_parameter.hpp"
|
||||
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
|
||||
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
|
||||
|
||||
using InDataType = int8_t;
|
||||
using WeiDataType = int8_t;
|
||||
using BiasDataType = int32_t;
|
||||
using RequantScaleDataType = float;
|
||||
using AccDataType = int32_t;
|
||||
using CShuffleDataType = int32_t;
|
||||
using OutDataType = int8_t;
|
||||
|
||||
template <ck::index_t... Is>
|
||||
using S = ck::Sequence<Is...>;
|
||||
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
using InElementOp = PassThrough;
|
||||
using WeiElementOp = PassThrough;
|
||||
using ActivationOp = ck::tensor_operation::element_wise::Relu;
|
||||
using OutElementOp = ck::tensor_operation::element_wise::Add_Activation_Mul2_Clamp<ActivationOp>;
|
||||
|
||||
static constexpr auto ConvSpec =
|
||||
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
|
||||
|
||||
static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
|
||||
|
||||
template <ck::index_t NDimSpatial,
|
||||
typename InLayout,
|
||||
typename WeiLayout,
|
||||
typename BiasLayout,
|
||||
typename RequantScaleLayout,
|
||||
typename OutLayout>
|
||||
using DeviceGroupedConvNDFwdInstance =
|
||||
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<
|
||||
NDimSpatial,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
ck::Tuple<BiasLayout, RequantScaleLayout>,
|
||||
OutLayout,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
ck::Tuple<BiasDataType, RequantScaleDataType>,
|
||||
OutDataType,
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
OutElementOp,
|
||||
ConvSpec, // ConvForwardSpecialization
|
||||
GemmSpec, // GemmSpecialization
|
||||
1, //
|
||||
256, // BlockSize
|
||||
128, // MPerBlock
|
||||
256, // NPerBlock
|
||||
64, // KPerBlock
|
||||
16, // AK1
|
||||
16, // BK1
|
||||
32, // MPerXdl
|
||||
32, // NPerXdl
|
||||
2, // MXdlPerWave
|
||||
4, // NXdlPerWave
|
||||
S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
|
||||
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
|
||||
S<1, 0, 2>, // ABlockTransferSrcAccessOrder
|
||||
2, // ABlockTransferSrcVectorDim
|
||||
16, // ABlockTransferSrcScalarPerVector
|
||||
16, // ABlockTransferDstScalarPerVector_AK1
|
||||
1, // ABlockLdsExtraM
|
||||
S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
|
||||
S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder
|
||||
S<1, 0, 2>, // BBlockTransferSrcAccessOrder
|
||||
2, // BBlockTransferSrcVectorDim
|
||||
16, // BBlockTransferSrcScalarPerVector
|
||||
16, // BBlockTransferDstScalarPerVector_BK1
|
||||
1, // BBlockLdsExtraN
|
||||
1,
|
||||
1,
|
||||
S<1, 64, 1, 4>,
|
||||
8>;
|
||||
|
||||
#pragma once
|
||||
template <ck::index_t NDimSpatial,
|
||||
typename InDataType,
|
||||
typename WeiDataType,
|
||||
@@ -219,12 +129,12 @@ bool run_grouped_conv_fwd(bool do_verification,
|
||||
|
||||
if(do_verification)
|
||||
{
|
||||
Tensor<CShuffleDataType> c_host(out_g_n_k_wos_desc);
|
||||
Tensor<AccDataType> c_host(out_g_n_k_wos_desc);
|
||||
|
||||
auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd<NDimSpatial,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
CShuffleDataType,
|
||||
AccDataType,
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
PassThrough>();
|
||||
@@ -257,7 +167,7 @@ bool run_grouped_conv_fwd(bool do_verification,
|
||||
return (pass ? 0 : 1);
|
||||
}
|
||||
|
||||
int main()
|
||||
int run_conv2d_fwd_bias_relu_perchannel_quantization_example()
|
||||
{
|
||||
bool do_verification = true;
|
||||
bool time_kernel = true;
|
||||
@@ -268,7 +178,7 @@ int main()
|
||||
1, // group
|
||||
4, // batch
|
||||
64, // output channels
|
||||
32, // input chanels
|
||||
192, // input chanels
|
||||
{3, 3}, // weight HW
|
||||
{71, 71}, // x HW
|
||||
{2, 2}, // strides
|
||||
@@ -312,8 +222,6 @@ int main()
|
||||
const auto out_g_n_k_wos_desc =
|
||||
ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(conv_param);
|
||||
|
||||
std::cout << out_g_n_k_wos_desc << std::endl;
|
||||
|
||||
using deviceOp = DeviceGroupedConvNDFwdInstance<ndim_spatial,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
@@ -1,94 +1,7 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
#include "ck/library/utility/algorithm.hpp"
|
||||
#include "ck/library/utility/check_err.hpp"
|
||||
#include "ck/library/utility/device_memory.hpp"
|
||||
#include "ck/library/utility/host_tensor.hpp"
|
||||
#include "ck/library/utility/host_tensor_generator.hpp"
|
||||
#include "ck/library/utility/literals.hpp"
|
||||
#include "ck/library/utility/convolution_parameter.hpp"
|
||||
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
|
||||
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
|
||||
|
||||
using InDataType = int8_t;
|
||||
using WeiDataType = int8_t;
|
||||
using BiasDataType = int32_t;
|
||||
using AccDataType = int32_t;
|
||||
using CShuffleDataType = int32_t;
|
||||
using OutDataType = int8_t;
|
||||
|
||||
template <ck::index_t... Is>
|
||||
using S = ck::Sequence<Is...>;
|
||||
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
using InElementOp = PassThrough;
|
||||
using WeiElementOp = PassThrough;
|
||||
using ActivationOp = ck::tensor_operation::element_wise::Relu;
|
||||
using OutElementOp = ck::tensor_operation::element_wise::Add_Activation_Mul_Clamp<ActivationOp>;
|
||||
|
||||
static constexpr auto ConvSpec =
|
||||
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
|
||||
|
||||
static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
|
||||
|
||||
template <ck::index_t NDimSpatial,
|
||||
typename InLayout,
|
||||
typename WeiLayout,
|
||||
typename BiasLayout,
|
||||
typename OutLayout>
|
||||
using DeviceGroupedConvNDFwdInstance =
|
||||
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<
|
||||
NDimSpatial,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
ck::Tuple<BiasLayout>,
|
||||
OutLayout,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
ck::Tuple<BiasDataType>,
|
||||
OutDataType,
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
OutElementOp,
|
||||
ConvSpec, // ConvForwardSpecialization
|
||||
GemmSpec, // GemmSpecialization
|
||||
1, //
|
||||
256, // BlockSize
|
||||
128, // MPerBlock
|
||||
256, // NPerBlock
|
||||
64, // KPerBlock
|
||||
16, // AK1
|
||||
16, // BK1
|
||||
32, // MPerXdl
|
||||
32, // NPerXdl
|
||||
2, // MXdlPerWave
|
||||
4, // NXdlPerWave
|
||||
S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
|
||||
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
|
||||
S<1, 0, 2>, // ABlockTransferSrcAccessOrder
|
||||
2, // ABlockTransferSrcVectorDim
|
||||
16, // ABlockTransferSrcScalarPerVector
|
||||
16, // ABlockTransferDstScalarPerVector_AK1
|
||||
1, // ABlockLdsExtraM
|
||||
S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
|
||||
S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder
|
||||
S<1, 0, 2>, // BBlockTransferSrcAccessOrder
|
||||
2, // BBlockTransferSrcVectorDim
|
||||
16, // BBlockTransferSrcScalarPerVector
|
||||
16, // BBlockTransferDstScalarPerVector_BK1
|
||||
1, // BBlockLdsExtraN
|
||||
1,
|
||||
1,
|
||||
S<1, 64, 1, 4>,
|
||||
8>;
|
||||
#pragma once
|
||||
|
||||
template <ck::index_t NDimSpatial,
|
||||
typename InDataType,
|
||||
@@ -205,12 +118,12 @@ bool run_grouped_conv_fwd(bool do_verification,
|
||||
|
||||
if(do_verification)
|
||||
{
|
||||
Tensor<CShuffleDataType> c_host(out_g_n_k_wos_desc);
|
||||
Tensor<AccDataType> c_host(out_g_n_k_wos_desc);
|
||||
|
||||
auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd<NDimSpatial,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
CShuffleDataType,
|
||||
AccDataType,
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
PassThrough>();
|
||||
@@ -242,7 +155,7 @@ bool run_grouped_conv_fwd(bool do_verification,
|
||||
return (pass ? 0 : 1);
|
||||
}
|
||||
|
||||
int main()
|
||||
int run_conv2d_fwd_bias_relu_perlayer_quantization_example()
|
||||
{
|
||||
bool do_verification = true;
|
||||
bool time_kernel = true;
|
||||
@@ -253,7 +166,7 @@ int main()
|
||||
1, // group
|
||||
4, // batch
|
||||
64, // output channels
|
||||
32, // input chanels
|
||||
192, // input chanels
|
||||
{3, 3}, // weight HW
|
||||
{71, 71}, // x HW
|
||||
{2, 2}, // strides
|
||||
@@ -294,8 +207,6 @@ int main()
|
||||
const auto out_g_n_k_wos_desc =
|
||||
ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(conv_param);
|
||||
|
||||
std::cout << out_g_n_k_wos_desc << std::endl;
|
||||
|
||||
return run_grouped_conv_fwd<
|
||||
ndim_spatial,
|
||||
InDataType,
|
||||
@@ -0,0 +1,235 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
template <ck::index_t NDimSpatial,
|
||||
typename InDataType,
|
||||
typename WeiDataType,
|
||||
typename OutDataType,
|
||||
typename InElementOp,
|
||||
typename WeiElementOp,
|
||||
typename OutElementOp,
|
||||
typename DeviceConvNDFwdInstance>
|
||||
bool run_grouped_conv_fwd(bool do_verification,
|
||||
bool time_kernel,
|
||||
const ck::utils::conv::ConvParam& conv_param,
|
||||
const HostTensorDescriptor& in_g_n_c_wis_desc,
|
||||
const HostTensorDescriptor& wei_g_k_c_xs_desc,
|
||||
const HostTensorDescriptor& requant_scale_g_k_desc,
|
||||
const HostTensorDescriptor& out_g_n_k_wos_desc,
|
||||
const InElementOp& in_element_op,
|
||||
const WeiElementOp& wei_element_op,
|
||||
const OutElementOp& out_element_op)
|
||||
{
|
||||
Tensor<InDataType> in(in_g_n_c_wis_desc);
|
||||
Tensor<WeiDataType> wei(wei_g_k_c_xs_desc);
|
||||
Tensor<RequantScaleDataType> requant_scale(requant_scale_g_k_desc);
|
||||
Tensor<OutDataType> out_host(out_g_n_k_wos_desc);
|
||||
Tensor<OutDataType> out_device(out_g_n_k_wos_desc);
|
||||
|
||||
std::cout << "in: " << in.mDesc << std::endl;
|
||||
std::cout << "wei: " << wei.mDesc << std::endl;
|
||||
std::cout << "requant_scale: " << requant_scale.mDesc << std::endl;
|
||||
std::cout << "out: " << out_host.mDesc << std::endl;
|
||||
|
||||
in.GenerateTensorValue(GeneratorTensor_2<InDataType>{-128, 127});
|
||||
wei.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-128, 127});
|
||||
requant_scale.GenerateTensorValue(GeneratorTensor_2<RequantScaleDataType>{0, 1});
|
||||
|
||||
DeviceMem in_device_buf(sizeof(InDataType) * in.mDesc.GetElementSpaceSize());
|
||||
DeviceMem wei_device_buf(sizeof(WeiDataType) * wei.mDesc.GetElementSpaceSize());
|
||||
DeviceMem requant_scale_device_buf(sizeof(RequantScaleDataType) *
|
||||
requant_scale.mDesc.GetElementSpaceSize());
|
||||
DeviceMem out_device_buf(sizeof(OutDataType) * out_device.mDesc.GetElementSpaceSize());
|
||||
|
||||
in_device_buf.ToDevice(in.mData.data());
|
||||
wei_device_buf.ToDevice(wei.mData.data());
|
||||
requant_scale_device_buf.ToDevice(requant_scale.mData.data());
|
||||
|
||||
std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_lengths{};
|
||||
std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_strides{};
|
||||
std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_lengths{};
|
||||
std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_strides{};
|
||||
std::array<ck::index_t, NDimSpatial + 3> d0_g_n_k_wos_lengths{};
|
||||
std::array<ck::index_t, NDimSpatial + 3> d0_g_n_k_wos_strides{};
|
||||
std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_lengths{};
|
||||
std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_strides{};
|
||||
std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
|
||||
std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
|
||||
std::array<ck::index_t, NDimSpatial> input_left_pads{};
|
||||
std::array<ck::index_t, NDimSpatial> input_right_pads{};
|
||||
|
||||
auto copy = [](const auto& x, auto& y) { ck::ranges::copy(x, y.begin()); };
|
||||
|
||||
copy(in_g_n_c_wis_desc.GetLengths(), a_g_n_c_wis_lengths);
|
||||
copy(in_g_n_c_wis_desc.GetStrides(), a_g_n_c_wis_strides);
|
||||
copy(wei_g_k_c_xs_desc.GetLengths(), b_g_k_c_xs_lengths);
|
||||
copy(wei_g_k_c_xs_desc.GetStrides(), b_g_k_c_xs_strides);
|
||||
copy(requant_scale_g_k_desc.GetLengths(), d0_g_n_k_wos_lengths);
|
||||
copy(requant_scale_g_k_desc.GetStrides(), d0_g_n_k_wos_strides);
|
||||
copy(out_g_n_k_wos_desc.GetLengths(), e_g_n_k_wos_lengths);
|
||||
copy(out_g_n_k_wos_desc.GetStrides(), e_g_n_k_wos_strides);
|
||||
copy(conv_param.conv_filter_strides_, conv_filter_strides);
|
||||
copy(conv_param.conv_filter_dilations_, conv_filter_dilations);
|
||||
copy(conv_param.input_left_pads_, input_left_pads);
|
||||
copy(conv_param.input_right_pads_, input_right_pads);
|
||||
|
||||
// do Conv
|
||||
auto conv = DeviceConvNDFwdInstance{};
|
||||
auto invoker = conv.MakeInvoker();
|
||||
auto argument = conv.MakeArgument(in_device_buf.GetDeviceBuffer(),
|
||||
wei_device_buf.GetDeviceBuffer(),
|
||||
{requant_scale_device_buf.GetDeviceBuffer()},
|
||||
out_device_buf.GetDeviceBuffer(),
|
||||
a_g_n_c_wis_lengths,
|
||||
a_g_n_c_wis_strides,
|
||||
b_g_k_c_xs_lengths,
|
||||
b_g_k_c_xs_strides,
|
||||
{d0_g_n_k_wos_lengths},
|
||||
{d0_g_n_k_wos_strides},
|
||||
e_g_n_k_wos_lengths,
|
||||
e_g_n_k_wos_strides,
|
||||
conv_filter_strides,
|
||||
conv_filter_dilations,
|
||||
input_left_pads,
|
||||
input_right_pads,
|
||||
in_element_op,
|
||||
wei_element_op,
|
||||
out_element_op);
|
||||
|
||||
if(!conv.IsSupportedArgument(argument))
|
||||
{
|
||||
throw std::runtime_error(
|
||||
"wrong! device_conv with the specified compilation parameters does "
|
||||
"not support this Conv problem");
|
||||
}
|
||||
|
||||
float avg_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
|
||||
|
||||
std::size_t flop = conv_param.GetFlops();
|
||||
std::size_t num_btype = conv_param.GetByte<InDataType, WeiDataType, OutDataType>();
|
||||
|
||||
float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
|
||||
float gb_per_sec = num_btype / 1.E6 / avg_time;
|
||||
std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
|
||||
<< conv.GetTypeString() << std::endl;
|
||||
|
||||
bool pass = true;
|
||||
|
||||
if(do_verification)
|
||||
{
|
||||
Tensor<AccDataType> c_host(out_g_n_k_wos_desc);
|
||||
|
||||
auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd<NDimSpatial,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
AccDataType,
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
PassThrough>();
|
||||
|
||||
auto ref_invoker = ref_conv.MakeInvoker();
|
||||
auto ref_argument = ref_conv.MakeArgument(in,
|
||||
wei,
|
||||
c_host,
|
||||
conv_param.conv_filter_strides_,
|
||||
conv_param.conv_filter_dilations_,
|
||||
conv_param.input_left_pads_,
|
||||
conv_param.input_right_pads_,
|
||||
in_element_op,
|
||||
wei_element_op,
|
||||
PassThrough{});
|
||||
|
||||
ref_invoker.Run(ref_argument);
|
||||
|
||||
// TODO: implement elementwise operation for host
|
||||
out_host.ForEach([&](auto&, auto idx) {
|
||||
out_element_op(out_host(idx), c_host(idx), requant_scale(idx));
|
||||
});
|
||||
|
||||
out_device_buf.FromDevice(out_device.mData.data());
|
||||
|
||||
pass &=
|
||||
ck::utils::check_err(out_device, out_host, "Error: incorrect results!", 1e-5f, 1e-4f);
|
||||
}
|
||||
|
||||
return (pass ? 0 : 1);
|
||||
}
|
||||
|
||||
int run_conv2d_fwd_perchannel_quantization_example()
|
||||
{
|
||||
bool do_verification = true;
|
||||
bool time_kernel = true;
|
||||
const ck::index_t ndim_spatial = 2;
|
||||
|
||||
ck::utils::conv::ConvParam conv_param{
|
||||
ndim_spatial, // n_dim
|
||||
1, // group
|
||||
4, // batch
|
||||
64, // output channels
|
||||
192, // input chanels
|
||||
{3, 3}, // weight HW
|
||||
{71, 71}, // x HW
|
||||
{2, 2}, // strides
|
||||
{1, 1}, // dilations
|
||||
{1, 1}, // left_pads
|
||||
{1, 1} // right_pads
|
||||
};
|
||||
|
||||
const auto in_element_op = InElementOp{};
|
||||
const auto wei_element_op = WeiElementOp{};
|
||||
const auto out_element_op = OutElementOp{ActivationOp{}};
|
||||
|
||||
using InLayout = ck::tensor_layout::convolution::GNHWC;
|
||||
using WeiLayout = ck::tensor_layout::convolution::GKYXC;
|
||||
using RequantScaleLayout = ck::tensor_layout::convolution::G_K;
|
||||
using OutLayout = ck::tensor_layout::convolution::GNHWK;
|
||||
|
||||
const auto in_g_n_c_wis_desc =
|
||||
ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(conv_param);
|
||||
|
||||
const auto wei_g_k_c_xs_desc =
|
||||
ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed<WeiLayout>(conv_param);
|
||||
|
||||
const auto requant_scale_g_k_desc =
|
||||
HostTensorDescriptor({conv_param.G_,
|
||||
conv_param.N_,
|
||||
conv_param.K_,
|
||||
conv_param.output_spatial_lengths_[0],
|
||||
conv_param.output_spatial_lengths_[1]},
|
||||
{
|
||||
conv_param.K_, // g
|
||||
0, // n
|
||||
1, // k
|
||||
0, // ho
|
||||
0 // wo
|
||||
});
|
||||
|
||||
const auto out_g_n_k_wos_desc =
|
||||
ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(conv_param);
|
||||
|
||||
using deviceOp = DeviceGroupedConvNDFwdInstance<ndim_spatial,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
RequantScaleLayout,
|
||||
OutLayout>;
|
||||
|
||||
return run_grouped_conv_fwd<ndim_spatial,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
OutDataType,
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
OutElementOp,
|
||||
deviceOp>(do_verification,
|
||||
time_kernel,
|
||||
conv_param,
|
||||
in_g_n_c_wis_desc,
|
||||
wei_g_k_c_xs_desc,
|
||||
requant_scale_g_k_desc,
|
||||
out_g_n_k_wos_desc,
|
||||
in_element_op,
|
||||
wei_element_op,
|
||||
out_element_op);
|
||||
}
|
||||
@@ -1,89 +1,7 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
#include "ck/library/utility/algorithm.hpp"
|
||||
#include "ck/library/utility/check_err.hpp"
|
||||
#include "ck/library/utility/device_memory.hpp"
|
||||
#include "ck/library/utility/host_tensor.hpp"
|
||||
#include "ck/library/utility/host_tensor_generator.hpp"
|
||||
#include "ck/library/utility/literals.hpp"
|
||||
#include "ck/library/utility/convolution_parameter.hpp"
|
||||
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
|
||||
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
|
||||
|
||||
using InDataType = int8_t;
|
||||
using WeiDataType = int8_t;
|
||||
using AccDataType = int32_t;
|
||||
using CShuffleDataType = int32_t;
|
||||
using OutDataType = int8_t;
|
||||
|
||||
template <ck::index_t... Is>
|
||||
using S = ck::Sequence<Is...>;
|
||||
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
using InElementOp = PassThrough;
|
||||
using WeiElementOp = PassThrough;
|
||||
using ActivationOp = PassThrough;
|
||||
using OutElementOp = ck::tensor_operation::element_wise::Activation_Mul_Clamp<ActivationOp>;
|
||||
|
||||
static constexpr auto ConvSpec =
|
||||
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
|
||||
|
||||
static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
|
||||
|
||||
template <ck::index_t NDimSpatial, typename InLayout, typename WeiLayout, typename OutLayout>
|
||||
using DeviceGroupedConvNDFwdInstance =
|
||||
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<
|
||||
NDimSpatial,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
ck::Tuple<>,
|
||||
OutLayout,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
ck::Tuple<>,
|
||||
OutDataType,
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
OutElementOp,
|
||||
ConvSpec, // ConvForwardSpecialization
|
||||
GemmSpec, // GemmSpecialization
|
||||
1, //
|
||||
256, // BlockSize
|
||||
128, // MPerBlock
|
||||
256, // NPerBlock
|
||||
64, // KPerBlock
|
||||
16, // AK1
|
||||
16, // BK1
|
||||
32, // MPerXdl
|
||||
32, // NPerXdl
|
||||
2, // MXdlPerWave
|
||||
4, // NXdlPerWave
|
||||
S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
|
||||
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
|
||||
S<1, 0, 2>, // ABlockTransferSrcAccessOrder
|
||||
2, // ABlockTransferSrcVectorDim
|
||||
16, // ABlockTransferSrcScalarPerVector
|
||||
16, // ABlockTransferDstScalarPerVector_AK1
|
||||
1, // ABlockLdsExtraM
|
||||
S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
|
||||
S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder
|
||||
S<1, 0, 2>, // BBlockTransferSrcAccessOrder
|
||||
2, // BBlockTransferSrcVectorDim
|
||||
16, // BBlockTransferSrcScalarPerVector
|
||||
16, // BBlockTransferDstScalarPerVector_BK1
|
||||
1, // BBlockLdsExtraN
|
||||
1,
|
||||
1,
|
||||
S<1, 64, 1, 4>,
|
||||
16>;
|
||||
#pragma once
|
||||
|
||||
template <ck::index_t NDimSpatial,
|
||||
typename InDataType,
|
||||
@@ -221,10 +139,10 @@ bool run_grouped_conv_fwd(bool do_verification,
|
||||
return (pass ? 0 : 1);
|
||||
}
|
||||
|
||||
int main()
|
||||
int run_conv2d_fwd_perlayer_quantization_example()
|
||||
{
|
||||
bool do_verification = true;
|
||||
bool time_kernel = true;
|
||||
bool time_kernel = false;
|
||||
const ck::index_t ndim_spatial = 2;
|
||||
|
||||
ck::utils::conv::ConvParam conv_param{
|
||||
@@ -232,7 +150,7 @@ int main()
|
||||
1, // group
|
||||
4, // batch
|
||||
64, // output channels
|
||||
32, // input chanels
|
||||
192, // input chanels
|
||||
{3, 3}, // weight HW
|
||||
{71, 71}, // x HW
|
||||
{2, 2}, // strides
|
||||
@@ -1,3 +0,0 @@
|
||||
add_example_executable(example_conv2d_fwd_xdl_perchannel_quantization_int8 conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8.cpp)
|
||||
add_example_executable(example_conv2d_fwd_xdl_perlayer_quantization_int8 conv2d_fwd_xdl_perlayer_quantization_int8.cpp)
|
||||
add_example_executable(example_conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8 conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8.cpp)
|
||||
@@ -134,7 +134,8 @@ __global__ void
|
||||
const Block2CTileMap block_2_ctile_map,
|
||||
const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch)
|
||||
{
|
||||
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx1030__))
|
||||
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx1030__) || \
|
||||
defined(__gfx90a__) || defined(__gfx908__))
|
||||
// offset base pointer for each work-group
|
||||
const index_t num_blocks_per_batch =
|
||||
__builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
|
||||
@@ -314,9 +315,8 @@ struct DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK
|
||||
const auto in_gemmm_gemmk_desc =
|
||||
matrix_padder.PadADescriptor_M_K(in_gemmmraw_gemmkraw_desc);
|
||||
|
||||
const auto M = in_gemmm_gemmk_desc.GetLength(I0);
|
||||
const auto K = in_gemmm_gemmk_desc.GetLength(I1);
|
||||
|
||||
const auto M = in_gemmm_gemmk_desc.GetLength(I0);
|
||||
const auto K = in_gemmm_gemmk_desc.GetLength(I1);
|
||||
const auto AK0 = K / K1;
|
||||
|
||||
return transform_tensor_descriptor(
|
||||
@@ -709,7 +709,8 @@ struct DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK
|
||||
namespace ctc = tensor_layout::convolution;
|
||||
|
||||
// check device
|
||||
if(!(ck::get_device_name() == "gfx906" || ck::get_device_name() == "gfx1030"))
|
||||
if(!(ck::get_device_name() == "gfx906" || ck::get_device_name() == "gfx1030" ||
|
||||
ck::get_device_name() == "gfx90a" || ck::get_device_name() == "gfx908"))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
@@ -834,6 +835,7 @@ struct DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
// check Gridwise GEMM
|
||||
return GridwiseGemm::CheckValidity(
|
||||
arg.a_grid_desc_ak0_m_ak1_, arg.b_grid_desc_bk0_n_bk1_, arg.e_grid_desc_m_n_);
|
||||
|
||||
@@ -51,7 +51,7 @@ __global__ void
|
||||
const Block2CTileMap block_2_ctile_map)
|
||||
{
|
||||
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx908__) || \
|
||||
defined(__gfx1030__))
|
||||
defined(__gfx90a__) || defined(__gfx1030__))
|
||||
|
||||
constexpr index_t shared_block_size =
|
||||
GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(ABDataType);
|
||||
@@ -552,7 +552,7 @@ struct DeviceGemmMultipleD_Dl : public DeviceGemmMultipleD<ALayout,
|
||||
static bool IsSupportedArgument(const Argument& arg)
|
||||
{
|
||||
if(ck::get_device_name() == "gfx906" || ck::get_device_name() == "gfx908" ||
|
||||
ck::get_device_name() == "gfx1030")
|
||||
ck::get_device_name() == "gfx90a" || ck::get_device_name() == "gfx1030")
|
||||
{
|
||||
return GridwiseGemm::CheckValidity(
|
||||
arg.a_grid_desc_k0_m_k1_, arg.b_grid_desc_k0_n_k1_, arg.e_grid_desc_m_n_);
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
#pragma once
|
||||
|
||||
#include "ck/utility/data_type.hpp"
|
||||
// #include "ck/utility/get_id.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
@@ -17,18 +18,27 @@ struct Activation_Mul_Clamp
|
||||
|
||||
__host__ __device__ constexpr void operator()(int8_t& y, const int32_t& x) const
|
||||
{
|
||||
float x_fp32 = ck::type_convert<float>(x);
|
||||
activationOp_(x_fp32, x_fp32);
|
||||
float y_fp32 = math::clamp(requantScale_ * x_fp32, -128.f, 127.f);
|
||||
y = ck::type_convert<int8_t>(y_fp32);
|
||||
float y_fp32 = ck::type_convert<float>(x);
|
||||
activationOp_(y_fp32, y_fp32);
|
||||
y_fp32 = math::clamp(requantScale_ * y_fp32, -128.f, 127.f);
|
||||
y = ck::type_convert<int8_t>(y_fp32);
|
||||
}
|
||||
|
||||
__host__ __device__ constexpr void operator()(float& y, const int32_t& x) const
|
||||
__device__ constexpr void operator()(int32_t& y, const int32_t& x) const
|
||||
{
|
||||
// We might type_convert to int8 after lambda in someplace
|
||||
float x_fp32 = ck::type_convert<float>(x);
|
||||
activationOp_(x_fp32, x_fp32);
|
||||
y = math::clamp(requantScale_ * x_fp32, -128.f, 127.f);
|
||||
// CAUSION - We might type_convert to int8 in threadwise copy
|
||||
// eg. GridwiseGemmDlMultipleD_km_kn_mn
|
||||
float y_fp32 = ck::type_convert<float>(x);
|
||||
activationOp_(y_fp32, y_fp32);
|
||||
y_fp32 = math::clamp(requantScale_ * y_fp32, -128.f, 127.f);
|
||||
y = ck::type_convert<int32_t>(y_fp32);
|
||||
}
|
||||
|
||||
__host__ constexpr void operator()(float& y, const float& x) const
|
||||
{
|
||||
// CAUSION - We might float in & float out in reference code
|
||||
activationOp_(y, x);
|
||||
y = math::clamp(requantScale_ * y, -128.f, 127.f);
|
||||
}
|
||||
|
||||
float requantScale_;
|
||||
@@ -51,6 +61,17 @@ struct Activation_Mul2_Clamp
|
||||
y = ck::type_convert<int8_t>(y_fp32);
|
||||
}
|
||||
|
||||
__device__ constexpr void
|
||||
operator()(int32_t& y, const int32_t& x, const float& requantScale) const
|
||||
{
|
||||
// CAUSION - We might type_convert to int8 in threadwise copy
|
||||
// eg. GridwiseGemmDlMultipleD_km_kn_mn
|
||||
float y_fp32 = ck::type_convert<float>(x);
|
||||
activationOp_(y_fp32, y_fp32);
|
||||
y_fp32 = math::clamp(requantScale * y_fp32, -128.f, 127.f);
|
||||
y = ck::type_convert<int32_t>(y_fp32);
|
||||
}
|
||||
|
||||
Activation activationOp_;
|
||||
};
|
||||
|
||||
@@ -72,6 +93,17 @@ struct Add_Activation_Mul_Clamp
|
||||
y = ck::type_convert<int8_t>(y_fp32);
|
||||
}
|
||||
|
||||
__host__ __device__ constexpr void
|
||||
operator()(int32_t& y, const int32_t& x, const int32_t& bias) const
|
||||
{
|
||||
// CAUSION - We might type_convert to int8 in threadwise copy
|
||||
// eg. GridwiseGemmDlMultipleD_km_kn_mn
|
||||
float y_fp32 = ck::type_convert<float>(x + bias);
|
||||
activationOp_(y_fp32, y_fp32);
|
||||
y_fp32 = math::clamp(requantScale_ * y_fp32, -128.f, 127.f);
|
||||
y = ck::type_convert<int32_t>(y_fp32);
|
||||
}
|
||||
|
||||
float requantScale_;
|
||||
Activation activationOp_;
|
||||
};
|
||||
@@ -92,6 +124,17 @@ struct Add_Activation_Mul2_Clamp
|
||||
y = ck::type_convert<int8_t>(y_fp32);
|
||||
}
|
||||
|
||||
__host__ __device__ constexpr void
|
||||
operator()(int32_t& y, const int32_t& x, const int32_t& bias, const float& requantScale) const
|
||||
{
|
||||
// CAUSION - We might type_convert to int8 in threadwise copy
|
||||
// eg. GridwiseGemmDlMultipleD_km_kn_mn
|
||||
float y_fp32 = ck::type_convert<float>(x + bias);
|
||||
activationOp_(y_fp32, y_fp32);
|
||||
y_fp32 = math::clamp(requantScale * y_fp32, -128.f, 127.f);
|
||||
y = ck::type_convert<int32_t>(y_fp32);
|
||||
}
|
||||
|
||||
Activation activationOp_;
|
||||
};
|
||||
|
||||
|
||||
@@ -185,8 +185,10 @@ struct GridwiseGemmDlMultipleD_km_kn_mn
|
||||
return b_grid_desc_k0_n0_n1_k1;
|
||||
}
|
||||
|
||||
// E desc for destination in blockwise copy
|
||||
template <typename CGridDesc_M_N_>
|
||||
__host__ __device__ static constexpr auto
|
||||
MakeCGridDescriptor_M0_M10_M11_N0_N10_N11(const CGridDesc_M_N& c_grid_desc_m_n)
|
||||
MakeCGridDescriptor_M0_M10_M11_N0_N10_N11(const CGridDesc_M_N_& c_grid_desc_m_n)
|
||||
{
|
||||
const auto M = c_grid_desc_m_n.GetLength(I0);
|
||||
const auto N = c_grid_desc_m_n.GetLength(I1);
|
||||
|
||||
@@ -135,6 +135,28 @@ __device__ void inner_product<half8_t, half8_t, float>(const half8_t& a, const h
|
||||
c);
|
||||
}
|
||||
|
||||
template <>
|
||||
__device__ void inner_product<int8_t, int8_t, int32_t>(const int8_t& a, const int8_t& b, int32_t& c)
|
||||
{
|
||||
c += type_convert<int32_t>(a) * type_convert<int32_t>(b);
|
||||
}
|
||||
|
||||
template <>
|
||||
__device__ void
|
||||
inner_product<int8x2_t, int8x2_t, int32_t>(const int8x2_t& a, const int8x2_t& b, int32_t& c)
|
||||
{
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
|
||||
inner_product(vector_type<int8_t, 2>{a}.AsType<int8_t>()[I0],
|
||||
vector_type<int8_t, 2>{b}.AsType<int8_t>()[I0],
|
||||
c);
|
||||
|
||||
inner_product(vector_type<int8_t, 2>{a}.AsType<int8_t>()[I1],
|
||||
vector_type<int8_t, 2>{b}.AsType<int8_t>()[I1],
|
||||
c);
|
||||
}
|
||||
|
||||
template <>
|
||||
__device__ void
|
||||
inner_product<int8x4_t, int8x4_t, int32_t>(const int8x4_t& a, const int8x4_t& b, int32_t& c)
|
||||
|
||||
@@ -0,0 +1,224 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// Layout(A, B, C) = [Col, Row, Row]
|
||||
void add_device_gemm_quantization_dl_c_shuffle_i8_i8_i8_km_kn_mn_instances(
|
||||
std::vector<std::unique_ptr<DeviceGemmMultipleD<Col,
|
||||
Row,
|
||||
Empty_Tuple,
|
||||
Row,
|
||||
int8_t,
|
||||
int8_t,
|
||||
Empty_Tuple,
|
||||
int8_t,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Activation_Mul_Clamp<PassThrough>>>>&
|
||||
instances);
|
||||
|
||||
// Layout(A, B, C) = [Col, Col, Row]
|
||||
void add_device_gemm_quantization_dl_c_shuffle_i8_i8_i8_km_nk_mn_instances(
|
||||
std::vector<std::unique_ptr<DeviceGemmMultipleD<Col,
|
||||
Col,
|
||||
Empty_Tuple,
|
||||
Row,
|
||||
int8_t,
|
||||
int8_t,
|
||||
Empty_Tuple,
|
||||
int8_t,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Activation_Mul_Clamp<PassThrough>>>>&
|
||||
instances);
|
||||
|
||||
// Layout(A, B, C) = [Row, Row, Row]
|
||||
void add_device_gemm_quantization_dl_c_shuffle_i8_i8_i8_mk_kn_mn_instances(
|
||||
std::vector<std::unique_ptr<DeviceGemmMultipleD<Row,
|
||||
Row,
|
||||
Empty_Tuple,
|
||||
Row,
|
||||
int8_t,
|
||||
int8_t,
|
||||
Empty_Tuple,
|
||||
int8_t,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Activation_Mul_Clamp<PassThrough>>>>&
|
||||
instances);
|
||||
|
||||
// Layout(A, B, C) = [Row, Col, Row]
|
||||
void add_device_gemm_quantization_dl_c_shuffle_i8_i8_i8_mk_nk_mn_instances(
|
||||
std::vector<std::unique_ptr<DeviceGemmMultipleD<Row,
|
||||
Col,
|
||||
Empty_Tuple,
|
||||
Row,
|
||||
int8_t,
|
||||
int8_t,
|
||||
Empty_Tuple,
|
||||
int8_t,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Activation_Mul_Clamp<PassThrough>>>>&
|
||||
instances);
|
||||
|
||||
// Layout(A, B, C) = [Col, Row, Row]
|
||||
void add_device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instances(
|
||||
std::vector<std::unique_ptr<DeviceGemmMultipleD<Col,
|
||||
Row,
|
||||
Empty_Tuple,
|
||||
Row,
|
||||
int8_t,
|
||||
int8_t,
|
||||
Empty_Tuple,
|
||||
int8_t,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Activation_Mul_Clamp<PassThrough>>>>&
|
||||
instances);
|
||||
|
||||
// Layout(A, B, C) = [Col, Col, Row]
|
||||
void add_device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instances(
|
||||
std::vector<std::unique_ptr<DeviceGemmMultipleD<Col,
|
||||
Col,
|
||||
Empty_Tuple,
|
||||
Row,
|
||||
int8_t,
|
||||
int8_t,
|
||||
Empty_Tuple,
|
||||
int8_t,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Activation_Mul_Clamp<PassThrough>>>>&
|
||||
instances);
|
||||
|
||||
// Layout(A, B, C) = [Row, Row, Row]
|
||||
void add_device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instances(
|
||||
std::vector<std::unique_ptr<DeviceGemmMultipleD<Row,
|
||||
Row,
|
||||
Empty_Tuple,
|
||||
Row,
|
||||
int8_t,
|
||||
int8_t,
|
||||
Empty_Tuple,
|
||||
int8_t,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Activation_Mul_Clamp<PassThrough>>>>&
|
||||
instances);
|
||||
|
||||
// Layout(A, B, C) = [Row, Col, Row]
|
||||
void add_device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instances(
|
||||
std::vector<std::unique_ptr<DeviceGemmMultipleD<Row,
|
||||
Col,
|
||||
Empty_Tuple,
|
||||
Row,
|
||||
int8_t,
|
||||
int8_t,
|
||||
Empty_Tuple,
|
||||
int8_t,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Activation_Mul_Clamp<PassThrough>>>>&
|
||||
instances);
|
||||
|
||||
template <typename ALayout,
|
||||
typename BLayout,
|
||||
typename ELayout,
|
||||
typename ADataType,
|
||||
typename BDataType,
|
||||
typename EDataType,
|
||||
typename Activation>
|
||||
struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGemmMultipleD<
|
||||
ALayout,
|
||||
BLayout,
|
||||
Empty_Tuple,
|
||||
ELayout,
|
||||
ADataType,
|
||||
BDataType,
|
||||
Empty_Tuple,
|
||||
EDataType,
|
||||
ck::tensor_operation::element_wise::PassThrough,
|
||||
ck::tensor_operation::element_wise::PassThrough,
|
||||
Activation_Mul_Clamp<Activation>>>
|
||||
{
|
||||
using DeviceOp = DeviceGemmMultipleD<ALayout,
|
||||
BLayout,
|
||||
Empty_Tuple,
|
||||
ELayout,
|
||||
ADataType,
|
||||
BDataType,
|
||||
Empty_Tuple,
|
||||
EDataType,
|
||||
ck::tensor_operation::element_wise::PassThrough,
|
||||
ck::tensor_operation::element_wise::PassThrough,
|
||||
Activation_Mul_Clamp<Activation>>;
|
||||
|
||||
static auto GetInstances()
|
||||
{
|
||||
std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
|
||||
|
||||
if constexpr(is_same_v<ADataType, int8_t> && is_same_v<BDataType, int8_t> &&
|
||||
is_same_v<EDataType, int8_t>)
|
||||
{
|
||||
if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
|
||||
is_same_v<ELayout, Row>)
|
||||
{
|
||||
if constexpr(is_same_v<Activation, PassThrough>)
|
||||
{
|
||||
add_device_gemm_quantization_dl_c_shuffle_i8_i8_i8_mk_kn_mn_instances(op_ptrs);
|
||||
add_device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instances(op_ptrs);
|
||||
}
|
||||
}
|
||||
else if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
|
||||
is_same_v<ELayout, Row>)
|
||||
{
|
||||
if constexpr(is_same_v<Activation, PassThrough>)
|
||||
{
|
||||
add_device_gemm_quantization_dl_c_shuffle_i8_i8_i8_mk_nk_mn_instances(op_ptrs);
|
||||
add_device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instances(op_ptrs);
|
||||
}
|
||||
}
|
||||
else if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Row> &&
|
||||
is_same_v<ELayout, Row>)
|
||||
{
|
||||
if constexpr(is_same_v<Activation, PassThrough>)
|
||||
{
|
||||
add_device_gemm_quantization_dl_c_shuffle_i8_i8_i8_km_kn_mn_instances(op_ptrs);
|
||||
add_device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instances(op_ptrs);
|
||||
}
|
||||
}
|
||||
else if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Col> &&
|
||||
is_same_v<ELayout, Row>)
|
||||
{
|
||||
if constexpr(is_same_v<Activation, PassThrough>)
|
||||
{
|
||||
add_device_gemm_quantization_dl_c_shuffle_i8_i8_i8_km_nk_mn_instances(op_ptrs);
|
||||
add_device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instances(op_ptrs);
|
||||
}
|
||||
}
|
||||
|
||||
return op_ptrs;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -18,7 +18,7 @@ namespace device {
|
||||
namespace instance {
|
||||
|
||||
// grouped conv2d forward, GNHWC/GKYXC/GNHWK
|
||||
void add_device_conv2d_bias_perchannel_quantization_int8_instances(
|
||||
void add_device_conv2d_dl_bias_perchannel_quantization_int8_instances(
|
||||
std::vector<
|
||||
std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
|
||||
GNHWC,
|
||||
@@ -34,7 +34,38 @@ void add_device_conv2d_bias_perchannel_quantization_int8_instances(
|
||||
Add_Activation_Mul2_Clamp<PassThrough>>>>&
|
||||
instances);
|
||||
|
||||
void add_device_conv2d_bias_relu_perchannel_quantization_int8_instances(
|
||||
void add_device_conv2d_dl_bias_relu_perchannel_quantization_int8_instances(
|
||||
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
|
||||
GNHWC,
|
||||
GKYXC,
|
||||
GK_GK_Tuple,
|
||||
GNHWK,
|
||||
int8_t,
|
||||
int8_t,
|
||||
I32_F32_Tuple,
|
||||
int8_t,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Add_Activation_Mul2_Clamp<Relu>>>>&
|
||||
instances);
|
||||
|
||||
void add_device_conv2d_xdl_bias_perchannel_quantization_int8_instances(
|
||||
std::vector<
|
||||
std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
|
||||
GNHWC,
|
||||
GKYXC,
|
||||
GK_GK_Tuple,
|
||||
GNHWK,
|
||||
int8_t,
|
||||
int8_t,
|
||||
I32_F32_Tuple,
|
||||
int8_t,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Add_Activation_Mul2_Clamp<PassThrough>>>>&
|
||||
instances);
|
||||
|
||||
void add_device_conv2d_xdl_bias_relu_perchannel_quantization_int8_instances(
|
||||
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
|
||||
GNHWC,
|
||||
GKYXC,
|
||||
@@ -98,9 +129,15 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
|
||||
is_same_v<DsDataType, I32_F32_Tuple> && is_same_v<OutDataType, int8_t>)
|
||||
{
|
||||
if constexpr(is_same_v<Activation, PassThrough>)
|
||||
add_device_conv2d_bias_perchannel_quantization_int8_instances(op_ptrs);
|
||||
{
|
||||
add_device_conv2d_dl_bias_perchannel_quantization_int8_instances(op_ptrs);
|
||||
add_device_conv2d_xdl_bias_perchannel_quantization_int8_instances(op_ptrs);
|
||||
}
|
||||
else if constexpr(is_same_v<Activation, Relu>)
|
||||
add_device_conv2d_bias_relu_perchannel_quantization_int8_instances(op_ptrs);
|
||||
{
|
||||
add_device_conv2d_dl_bias_relu_perchannel_quantization_int8_instances(op_ptrs);
|
||||
add_device_conv2d_xdl_bias_relu_perchannel_quantization_int8_instances(op_ptrs);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -18,7 +18,7 @@ namespace device {
|
||||
namespace instance {
|
||||
|
||||
// grouped conv2d forward, GNHWC/GKYXC/GNHWK
|
||||
void add_device_conv2d_bias_perlayer_quantization_int8_instances(
|
||||
void add_device_conv2d_dl_bias_perlayer_quantization_int8_instances(
|
||||
std::vector<
|
||||
std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
|
||||
GNHWC,
|
||||
@@ -34,7 +34,38 @@ void add_device_conv2d_bias_perlayer_quantization_int8_instances(
|
||||
Add_Activation_Mul_Clamp<PassThrough>>>>&
|
||||
instances);
|
||||
|
||||
void add_device_conv2d_bias_relu_perlayer_quantization_int8_instances(
|
||||
void add_device_conv2d_dl_bias_relu_perlayer_quantization_int8_instances(
|
||||
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
|
||||
GNHWC,
|
||||
GKYXC,
|
||||
GK_Tuple,
|
||||
GNHWK,
|
||||
int8_t,
|
||||
int8_t,
|
||||
I32_Tuple,
|
||||
int8_t,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Add_Activation_Mul_Clamp<Relu>>>>&
|
||||
instances);
|
||||
|
||||
void add_device_conv2d_xdl_bias_perlayer_quantization_int8_instances(
|
||||
std::vector<
|
||||
std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
|
||||
GNHWC,
|
||||
GKYXC,
|
||||
GK_Tuple,
|
||||
GNHWK,
|
||||
int8_t,
|
||||
int8_t,
|
||||
I32_Tuple,
|
||||
int8_t,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Add_Activation_Mul_Clamp<PassThrough>>>>&
|
||||
instances);
|
||||
|
||||
void add_device_conv2d_xdl_bias_relu_perlayer_quantization_int8_instances(
|
||||
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
|
||||
GNHWC,
|
||||
GKYXC,
|
||||
@@ -98,9 +129,15 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
|
||||
is_same_v<DsDataType, I32_Tuple> && is_same_v<OutDataType, int8_t>)
|
||||
{
|
||||
if constexpr(is_same_v<Activation, PassThrough>)
|
||||
add_device_conv2d_bias_perlayer_quantization_int8_instances(op_ptrs);
|
||||
{
|
||||
add_device_conv2d_dl_bias_perlayer_quantization_int8_instances(op_ptrs);
|
||||
add_device_conv2d_xdl_bias_perlayer_quantization_int8_instances(op_ptrs);
|
||||
}
|
||||
else if constexpr(is_same_v<Activation, Relu>)
|
||||
add_device_conv2d_bias_relu_perlayer_quantization_int8_instances(op_ptrs);
|
||||
{
|
||||
add_device_conv2d_dl_bias_relu_perlayer_quantization_int8_instances(op_ptrs);
|
||||
add_device_conv2d_xdl_bias_relu_perlayer_quantization_int8_instances(op_ptrs);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -18,7 +18,7 @@ namespace device {
|
||||
namespace instance {
|
||||
|
||||
// grouped conv2d forward, GNHWC/GKYXC/GNHWK
|
||||
void add_device_conv2d_perchannel_quantization_int8_instances(
|
||||
void add_device_conv2d_dl_perchannel_quantization_int8_instances(
|
||||
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
|
||||
GNHWC,
|
||||
GKYXC,
|
||||
@@ -33,7 +33,37 @@ void add_device_conv2d_perchannel_quantization_int8_instances(
|
||||
Activation_Mul2_Clamp<PassThrough>>>>&
|
||||
instances);
|
||||
|
||||
void add_device_conv2d_relu_perchannel_quantization_int8_instances(
|
||||
void add_device_conv2d_dl_relu_perchannel_quantization_int8_instances(
|
||||
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
|
||||
GNHWC,
|
||||
GKYXC,
|
||||
GK_Tuple,
|
||||
GNHWK,
|
||||
int8_t,
|
||||
int8_t,
|
||||
F32_Tuple,
|
||||
int8_t,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Activation_Mul2_Clamp<Relu>>>>&
|
||||
instances);
|
||||
|
||||
void add_device_conv2d_xdl_perchannel_quantization_int8_instances(
|
||||
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
|
||||
GNHWC,
|
||||
GKYXC,
|
||||
GK_Tuple,
|
||||
GNHWK,
|
||||
int8_t,
|
||||
int8_t,
|
||||
F32_Tuple,
|
||||
int8_t,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Activation_Mul2_Clamp<PassThrough>>>>&
|
||||
instances);
|
||||
|
||||
void add_device_conv2d_xdl_relu_perchannel_quantization_int8_instances(
|
||||
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
|
||||
GNHWC,
|
||||
GKYXC,
|
||||
@@ -97,9 +127,15 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
|
||||
is_same_v<OutDataType, int8_t>)
|
||||
{
|
||||
if constexpr(is_same_v<Activation, PassThrough>)
|
||||
add_device_conv2d_perchannel_quantization_int8_instances(op_ptrs);
|
||||
{
|
||||
add_device_conv2d_dl_perchannel_quantization_int8_instances(op_ptrs);
|
||||
add_device_conv2d_xdl_perchannel_quantization_int8_instances(op_ptrs);
|
||||
}
|
||||
else if constexpr(is_same_v<Activation, Relu>)
|
||||
add_device_conv2d_relu_perchannel_quantization_int8_instances(op_ptrs);
|
||||
{
|
||||
add_device_conv2d_dl_relu_perchannel_quantization_int8_instances(op_ptrs);
|
||||
add_device_conv2d_xdl_relu_perchannel_quantization_int8_instances(op_ptrs);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -18,7 +18,7 @@ namespace device {
|
||||
namespace instance {
|
||||
|
||||
// grouped conv2d forward, GNHWC/GKYXC/GNHWK
|
||||
void add_device_conv2d_perlayer_quantization_int8_instances(
|
||||
void add_device_conv2d_dl_perlayer_quantization_int8_instances(
|
||||
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
|
||||
GNHWC,
|
||||
GKYXC,
|
||||
@@ -33,7 +33,37 @@ void add_device_conv2d_perlayer_quantization_int8_instances(
|
||||
Activation_Mul_Clamp<PassThrough>>>>&
|
||||
instances);
|
||||
|
||||
void add_device_conv2d_relu_perlayer_quantization_int8_instances(
|
||||
void add_device_conv2d_dl_relu_perlayer_quantization_int8_instances(
|
||||
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
|
||||
GNHWC,
|
||||
GKYXC,
|
||||
Empty_Tuple,
|
||||
GNHWK,
|
||||
int8_t,
|
||||
int8_t,
|
||||
Empty_Tuple,
|
||||
int8_t,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Activation_Mul_Clamp<Relu>>>>&
|
||||
instances);
|
||||
|
||||
void add_device_conv2d_xdl_perlayer_quantization_int8_instances(
|
||||
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
|
||||
GNHWC,
|
||||
GKYXC,
|
||||
Empty_Tuple,
|
||||
GNHWK,
|
||||
int8_t,
|
||||
int8_t,
|
||||
Empty_Tuple,
|
||||
int8_t,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Activation_Mul_Clamp<PassThrough>>>>&
|
||||
instances);
|
||||
|
||||
void add_device_conv2d_xdl_relu_perlayer_quantization_int8_instances(
|
||||
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<2,
|
||||
GNHWC,
|
||||
GKYXC,
|
||||
@@ -94,9 +124,15 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
|
||||
is_same_v<OutDataType, int8_t>)
|
||||
{
|
||||
if constexpr(is_same_v<Activation, PassThrough>)
|
||||
add_device_conv2d_perlayer_quantization_int8_instances(op_ptrs);
|
||||
{
|
||||
add_device_conv2d_dl_perlayer_quantization_int8_instances(op_ptrs);
|
||||
add_device_conv2d_xdl_perlayer_quantization_int8_instances(op_ptrs);
|
||||
}
|
||||
else if constexpr(is_same_v<Activation, Relu>)
|
||||
add_device_conv2d_relu_perlayer_quantization_int8_instances(op_ptrs);
|
||||
{
|
||||
add_device_conv2d_dl_relu_perlayer_quantization_int8_instances(op_ptrs);
|
||||
add_device_conv2d_xdl_relu_perlayer_quantization_int8_instances(op_ptrs);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,6 +1,38 @@
|
||||
add_instance_library(device_quantization_instance
|
||||
device_conv2d_xdl_bias_perchannel_quantization_int8_instance.cpp
|
||||
device_conv2d_xdl_bias_perlayer_quantization_int8_instance.cpp
|
||||
device_conv2d_xdl_perchannel_quantization_int8_instance.cpp
|
||||
device_conv2d_xdl_perlayer_quantization_int8_instance.cpp
|
||||
set(CONV2D_PERLAYER_QUANT_SRC
|
||||
conv2d_fwd/device_conv2d_dl_perlayer_quantization_int8_instance.cpp
|
||||
conv2d_fwd/device_conv2d_xdl_perlayer_quantization_int8_instance.cpp
|
||||
)
|
||||
|
||||
set(CONV2D_PERCHANNEL_QUANT_SRC
|
||||
conv2d_fwd/device_conv2d_dl_perchannel_quantization_int8_instance.cpp
|
||||
conv2d_fwd/device_conv2d_xdl_perchannel_quantization_int8_instance.cpp
|
||||
)
|
||||
|
||||
set(CONV2D_BIAS_PERLAYER_QUANT_SRC
|
||||
conv2d_fwd/device_conv2d_dl_bias_perlayer_quantization_int8_instance.cpp
|
||||
conv2d_fwd/device_conv2d_xdl_bias_perlayer_quantization_int8_instance.cpp
|
||||
)
|
||||
|
||||
set(CONV2D_BIAS_PERCHANNEL_QUANT_SRC
|
||||
conv2d_fwd/device_conv2d_dl_bias_perchannel_quantization_int8_instance.cpp
|
||||
conv2d_fwd/device_conv2d_xdl_bias_perchannel_quantization_int8_instance.cpp
|
||||
)
|
||||
|
||||
set(GEMM_QUANT_SRC
|
||||
gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp
|
||||
gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp
|
||||
gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp
|
||||
gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp
|
||||
gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp
|
||||
gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp
|
||||
gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp
|
||||
gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp
|
||||
)
|
||||
|
||||
add_instance_library(device_quantization_instance
|
||||
${CONV2D_PERLAYER_QUANT_SRC}
|
||||
${CONV2D_PERCHANNEL_QUANT_SRC}
|
||||
${CONV2D_BIAS_PERLAYER_QUANT_SRC}
|
||||
${CONV2D_BIAS_PERCHANNEL_QUANT_SRC}
|
||||
${GEMM_QUANT_SRC}
|
||||
)
|
||||
|
||||
@@ -0,0 +1,59 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
using Empty_Tuple = ck::Tuple<>;
|
||||
template <ck::index_t... Is>
|
||||
using S = ck::Sequence<Is...>;
|
||||
|
||||
using GNHWC = ck::tensor_layout::convolution::GNHWC;
|
||||
using GKYXC = ck::tensor_layout::convolution::GKYXC;
|
||||
using GNHWK = ck::tensor_layout::convolution::GNHWK;
|
||||
using GK = ck::tensor_layout::convolution::G_K;
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
using Relu = ck::tensor_operation::element_wise::Relu;
|
||||
|
||||
using GK_Tuple = ck::Tuple<GK>;
|
||||
using GK_GK_Tuple = ck::Tuple<GK, GK>;
|
||||
using I32_Tuple = ck::Tuple<int32_t>;
|
||||
using F32_Tuple = ck::Tuple<float>;
|
||||
using I32_F32_Tuple = ck::Tuple<int32_t, float>;
|
||||
|
||||
using Mul_Clamp = ck::tensor_operation::element_wise::Activation_Mul_Clamp<PassThrough>;
|
||||
using Relu_Mul_Clamp = ck::tensor_operation::element_wise::Activation_Mul_Clamp<Relu>;
|
||||
|
||||
using Add_Mul_Clamp = ck::tensor_operation::element_wise::Add_Activation_Mul_Clamp<PassThrough>;
|
||||
using Add_Relu_Mul_Clamp = ck::tensor_operation::element_wise::Add_Activation_Mul_Clamp<Relu>;
|
||||
|
||||
using Mul2_Clamp = ck::tensor_operation::element_wise::Activation_Mul2_Clamp<PassThrough>;
|
||||
using Relu_Mul2_Clamp = ck::tensor_operation::element_wise::Activation_Mul2_Clamp<Relu>;
|
||||
|
||||
using Add_Mul2_Clamp = ck::tensor_operation::element_wise::Add_Activation_Mul2_Clamp<PassThrough>;
|
||||
using Add_Relu_Mul2_Clamp = ck::tensor_operation::element_wise::Add_Activation_Mul2_Clamp<Relu>;
|
||||
|
||||
static constexpr ck::index_t NDimSpatial = 2;
|
||||
static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
|
||||
static constexpr auto ConvFwdDefault =
|
||||
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
|
||||
static constexpr auto ConvFwd1x1P0 =
|
||||
ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Pad0;
|
||||
static constexpr auto ConvFwd1x1S1P0 =
|
||||
ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0;
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -0,0 +1,82 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "device_conv2d_dl_int8_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
void add_device_conv2d_dl_bias_perchannel_quantization_int8_instances(
|
||||
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
|
||||
GNHWC,
|
||||
GKYXC,
|
||||
GK_GK_Tuple,
|
||||
GNHWK,
|
||||
int8_t,
|
||||
int8_t,
|
||||
I32_F32_Tuple,
|
||||
int8_t,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Add_Mul2_Clamp>>>& instances)
|
||||
{
|
||||
// dl
|
||||
add_device_operation_instances(instances,
|
||||
device_grouped_conv2d_dl_int8_instances<GK_GK_Tuple,
|
||||
I32_F32_Tuple,
|
||||
Add_Mul2_Clamp,
|
||||
ConvFwdDefault,
|
||||
4>{});
|
||||
add_device_operation_instances(instances,
|
||||
device_grouped_conv2d_dl_int8_instances<GK_GK_Tuple,
|
||||
I32_F32_Tuple,
|
||||
Add_Mul2_Clamp,
|
||||
ConvFwd1x1P0,
|
||||
4>{});
|
||||
add_device_operation_instances(instances,
|
||||
device_grouped_conv2d_dl_int8_instances<GK_GK_Tuple,
|
||||
I32_F32_Tuple,
|
||||
Add_Mul2_Clamp,
|
||||
ConvFwd1x1S1P0,
|
||||
4>{});
|
||||
}
|
||||
|
||||
void add_device_conv2d_dl_bias_relu_perchannel_quantization_int8_instances(
|
||||
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
|
||||
GNHWC,
|
||||
GKYXC,
|
||||
GK_GK_Tuple,
|
||||
GNHWK,
|
||||
int8_t,
|
||||
int8_t,
|
||||
I32_F32_Tuple,
|
||||
int8_t,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Add_Relu_Mul2_Clamp>>>& instances)
|
||||
{
|
||||
// dl
|
||||
add_device_operation_instances(instances,
|
||||
device_grouped_conv2d_dl_int8_instances<GK_GK_Tuple,
|
||||
I32_F32_Tuple,
|
||||
Add_Relu_Mul2_Clamp,
|
||||
ConvFwdDefault,
|
||||
4>{});
|
||||
add_device_operation_instances(instances,
|
||||
device_grouped_conv2d_dl_int8_instances<GK_GK_Tuple,
|
||||
I32_F32_Tuple,
|
||||
Add_Relu_Mul2_Clamp,
|
||||
ConvFwd1x1P0,
|
||||
4>{});
|
||||
add_device_operation_instances(instances,
|
||||
device_grouped_conv2d_dl_int8_instances<GK_GK_Tuple,
|
||||
I32_F32_Tuple,
|
||||
Add_Relu_Mul2_Clamp,
|
||||
ConvFwd1x1S1P0,
|
||||
4>{});
|
||||
}
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -0,0 +1,82 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "device_conv2d_dl_int8_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
void add_device_conv2d_dl_bias_perlayer_quantization_int8_instances(
|
||||
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
|
||||
GNHWC,
|
||||
GKYXC,
|
||||
GK_Tuple,
|
||||
GNHWK,
|
||||
int8_t,
|
||||
int8_t,
|
||||
I32_Tuple,
|
||||
int8_t,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Add_Mul_Clamp>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(instances,
|
||||
device_grouped_conv2d_dl_int8_instances<GK_Tuple,
|
||||
I32_Tuple,
|
||||
Add_Mul_Clamp,
|
||||
ConvFwdDefault,
|
||||
4>{});
|
||||
add_device_operation_instances(instances,
|
||||
device_grouped_conv2d_dl_int8_instances<GK_Tuple,
|
||||
I32_Tuple,
|
||||
Add_Mul_Clamp,
|
||||
ConvFwd1x1P0,
|
||||
4>{});
|
||||
add_device_operation_instances(instances,
|
||||
device_grouped_conv2d_dl_int8_instances<GK_Tuple,
|
||||
I32_Tuple,
|
||||
Add_Mul_Clamp,
|
||||
ConvFwd1x1S1P0,
|
||||
4>{});
|
||||
}
|
||||
|
||||
void add_device_conv2d_dl_bias_relu_perlayer_quantization_int8_instances(
|
||||
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
|
||||
GNHWC,
|
||||
GKYXC,
|
||||
GK_Tuple,
|
||||
GNHWK,
|
||||
int8_t,
|
||||
int8_t,
|
||||
I32_Tuple,
|
||||
int8_t,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Add_Relu_Mul_Clamp>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(instances,
|
||||
device_grouped_conv2d_dl_int8_instances<GK_Tuple,
|
||||
I32_Tuple,
|
||||
Add_Relu_Mul_Clamp,
|
||||
ConvFwdDefault,
|
||||
4>{});
|
||||
|
||||
add_device_operation_instances(instances,
|
||||
device_grouped_conv2d_dl_int8_instances<GK_Tuple,
|
||||
I32_Tuple,
|
||||
Add_Relu_Mul_Clamp,
|
||||
ConvFwd1x1P0,
|
||||
4>{});
|
||||
|
||||
add_device_operation_instances(instances,
|
||||
device_grouped_conv2d_dl_int8_instances<GK_Tuple,
|
||||
I32_Tuple,
|
||||
Add_Relu_Mul_Clamp,
|
||||
ConvFwd1x1S1P0,
|
||||
4>{});
|
||||
}
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -0,0 +1,33 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "conv2d_quantization_common.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// clang-format off
|
||||
template <typename DsLayout,
|
||||
typename DsDatatype,
|
||||
typename OutElementOp,
|
||||
ConvolutionForwardSpecialization ConvSpec,
|
||||
index_t DstScalarPerVector>
|
||||
using device_grouped_conv2d_dl_int8_instances =
|
||||
std::tuple<
|
||||
// ###########################################| NDim| InData| WeiData| MultpleD| OutData| AccData| InLayout| WeiLayout| MultipleD| OutLayout| In| Wei| Out| Convolution| GEMM| Block| MPer| NPer| K0Per| K1| M1Per| N1Per| KPer| M11N11Thread| M11N11Thread| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| CThreadTransfer| CThreadTransfer| CThreadTransfer|
|
||||
// ###########################################| Spatial| Type| Type| Type| Type| Type| | | Layout| | Elementwise| Elementwise| Elementwise| Forward| Spacialization| Size| Block| Block| Block| | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| SrcDstAccess| SrcDstVectorDim| DstScalarPerVector|
|
||||
// ###########################################| | | | | | | | | | | Operation| Operation| Operation| Specialization| | | | | | | | | | | | K0_M0_M1_K1| K0_M0_M1_K1| ArrangeOrder| Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1| K0_N0_N1_K1| K0_N0_N1_K1| ArrangeOrder| Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1| Order| | |
|
||||
// ###########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
|
||||
DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK< NDimSpatial, int8_t, int8_t, DsDatatype, int8_t, int32_t, GNHWC, GKYXC, DsLayout, GNHWK, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 256, 128, 128, 16, 4, 4, 4, 1, S<8, 2>, S<8, 2>, S<8, 1, 1, 4>, S<2, 1, 128, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 4>, S<1, 2, 0, 3>, S<1, 1, 1, 4>, S<8, 1, 1, 4>, S<2, 1, 128, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 4>, S<1, 2, 0, 3>, S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>, 5, DstScalarPerVector>
|
||||
>;
|
||||
// clang-format on
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -0,0 +1,80 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "device_conv2d_dl_int8_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
void add_device_conv2d_dl_perchannel_quantization_int8_instances(
|
||||
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
|
||||
GNHWC,
|
||||
GKYXC,
|
||||
GK_Tuple,
|
||||
GNHWK,
|
||||
int8_t,
|
||||
int8_t,
|
||||
F32_Tuple,
|
||||
int8_t,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Mul2_Clamp>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(instances,
|
||||
device_grouped_conv2d_dl_int8_instances<GK_Tuple,
|
||||
F32_Tuple,
|
||||
Mul2_Clamp,
|
||||
ConvFwdDefault,
|
||||
4>{});
|
||||
add_device_operation_instances(instances,
|
||||
device_grouped_conv2d_dl_int8_instances<GK_Tuple,
|
||||
F32_Tuple,
|
||||
Mul2_Clamp,
|
||||
ConvFwd1x1P0,
|
||||
4>{});
|
||||
add_device_operation_instances(instances,
|
||||
device_grouped_conv2d_dl_int8_instances<GK_Tuple,
|
||||
F32_Tuple,
|
||||
Mul2_Clamp,
|
||||
ConvFwd1x1S1P0,
|
||||
4>{});
|
||||
}
|
||||
|
||||
void add_device_conv2d_dl_relu_perchannel_quantization_int8_instances(
|
||||
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
|
||||
GNHWC,
|
||||
GKYXC,
|
||||
GK_Tuple,
|
||||
GNHWK,
|
||||
int8_t,
|
||||
int8_t,
|
||||
F32_Tuple,
|
||||
int8_t,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Relu_Mul2_Clamp>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(instances,
|
||||
device_grouped_conv2d_dl_int8_instances<GK_Tuple,
|
||||
F32_Tuple,
|
||||
Relu_Mul2_Clamp,
|
||||
ConvFwdDefault,
|
||||
4>{});
|
||||
add_device_operation_instances(instances,
|
||||
device_grouped_conv2d_dl_int8_instances<GK_Tuple,
|
||||
F32_Tuple,
|
||||
Relu_Mul2_Clamp,
|
||||
ConvFwd1x1P0,
|
||||
4>{});
|
||||
add_device_operation_instances(instances,
|
||||
device_grouped_conv2d_dl_int8_instances<GK_Tuple,
|
||||
F32_Tuple,
|
||||
Relu_Mul2_Clamp,
|
||||
ConvFwd1x1S1P0,
|
||||
4>{});
|
||||
}
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -0,0 +1,80 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "device_conv2d_dl_int8_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
void add_device_conv2d_dl_perlayer_quantization_int8_instances(
|
||||
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
|
||||
GNHWC,
|
||||
GKYXC,
|
||||
Empty_Tuple,
|
||||
GNHWK,
|
||||
int8_t,
|
||||
int8_t,
|
||||
Empty_Tuple,
|
||||
int8_t,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Mul_Clamp>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(instances,
|
||||
device_grouped_conv2d_dl_int8_instances<Empty_Tuple,
|
||||
Empty_Tuple,
|
||||
Mul_Clamp,
|
||||
ConvFwdDefault,
|
||||
4>{});
|
||||
add_device_operation_instances(instances,
|
||||
device_grouped_conv2d_dl_int8_instances<Empty_Tuple,
|
||||
Empty_Tuple,
|
||||
Mul_Clamp,
|
||||
ConvFwd1x1P0,
|
||||
4>{});
|
||||
add_device_operation_instances(instances,
|
||||
device_grouped_conv2d_dl_int8_instances<Empty_Tuple,
|
||||
Empty_Tuple,
|
||||
Mul_Clamp,
|
||||
ConvFwd1x1S1P0,
|
||||
4>{});
|
||||
}
|
||||
|
||||
void add_device_conv2d_dl_relu_perlayer_quantization_int8_instances(
|
||||
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
|
||||
GNHWC,
|
||||
GKYXC,
|
||||
Empty_Tuple,
|
||||
GNHWK,
|
||||
int8_t,
|
||||
int8_t,
|
||||
Empty_Tuple,
|
||||
int8_t,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Relu_Mul_Clamp>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(instances,
|
||||
device_grouped_conv2d_dl_int8_instances<Empty_Tuple,
|
||||
Empty_Tuple,
|
||||
Relu_Mul_Clamp,
|
||||
ConvFwdDefault,
|
||||
4>{});
|
||||
add_device_operation_instances(instances,
|
||||
device_grouped_conv2d_dl_int8_instances<Empty_Tuple,
|
||||
Empty_Tuple,
|
||||
Relu_Mul_Clamp,
|
||||
ConvFwd1x1P0,
|
||||
4>{});
|
||||
add_device_operation_instances(instances,
|
||||
device_grouped_conv2d_dl_int8_instances<Empty_Tuple,
|
||||
Empty_Tuple,
|
||||
Relu_Mul_Clamp,
|
||||
ConvFwd1x1S1P0,
|
||||
4>{});
|
||||
}
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -7,7 +7,7 @@ namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
void add_device_conv2d_bias_perchannel_quantization_int8_instances(
|
||||
void add_device_conv2d_xdl_bias_perchannel_quantization_int8_instances(
|
||||
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
|
||||
GNHWC,
|
||||
GKYXC,
|
||||
@@ -22,23 +22,26 @@ void add_device_conv2d_bias_perchannel_quantization_int8_instances(
|
||||
Add_Mul2_Clamp>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(instances,
|
||||
device_conv2d_int8_32Ds_instances<GK_GK_Tuple,
|
||||
I32_F32_Tuple,
|
||||
Add_Mul2_Clamp,
|
||||
ConvFwdDefault>{});
|
||||
device_grouped_conv2d_xdl_int8_instances<GK_GK_Tuple,
|
||||
I32_F32_Tuple,
|
||||
Add_Mul2_Clamp,
|
||||
ConvFwdDefault,
|
||||
8>{});
|
||||
add_device_operation_instances(instances,
|
||||
device_conv2d_int8_32Ds_instances<GK_GK_Tuple,
|
||||
I32_F32_Tuple,
|
||||
Add_Mul2_Clamp,
|
||||
ConvFwd1x1P0>{});
|
||||
device_grouped_conv2d_xdl_int8_instances<GK_GK_Tuple,
|
||||
I32_F32_Tuple,
|
||||
Add_Mul2_Clamp,
|
||||
ConvFwd1x1P0,
|
||||
8>{});
|
||||
add_device_operation_instances(instances,
|
||||
device_conv2d_int8_32Ds_instances<GK_GK_Tuple,
|
||||
I32_F32_Tuple,
|
||||
Add_Mul2_Clamp,
|
||||
ConvFwd1x1S1P0>{});
|
||||
device_grouped_conv2d_xdl_int8_instances<GK_GK_Tuple,
|
||||
I32_F32_Tuple,
|
||||
Add_Mul2_Clamp,
|
||||
ConvFwd1x1S1P0,
|
||||
8>{});
|
||||
}
|
||||
|
||||
void add_device_conv2d_bias_relu_perchannel_quantization_int8_instances(
|
||||
void add_device_conv2d_xdl_bias_relu_perchannel_quantization_int8_instances(
|
||||
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
|
||||
GNHWC,
|
||||
GKYXC,
|
||||
@@ -53,20 +56,23 @@ void add_device_conv2d_bias_relu_perchannel_quantization_int8_instances(
|
||||
Add_Relu_Mul2_Clamp>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(instances,
|
||||
device_conv2d_int8_32Ds_instances<GK_GK_Tuple,
|
||||
I32_F32_Tuple,
|
||||
Add_Relu_Mul2_Clamp,
|
||||
ConvFwdDefault>{});
|
||||
device_grouped_conv2d_xdl_int8_instances<GK_GK_Tuple,
|
||||
I32_F32_Tuple,
|
||||
Add_Relu_Mul2_Clamp,
|
||||
ConvFwdDefault,
|
||||
8>{});
|
||||
add_device_operation_instances(instances,
|
||||
device_conv2d_int8_32Ds_instances<GK_GK_Tuple,
|
||||
I32_F32_Tuple,
|
||||
Add_Relu_Mul2_Clamp,
|
||||
ConvFwd1x1P0>{});
|
||||
device_grouped_conv2d_xdl_int8_instances<GK_GK_Tuple,
|
||||
I32_F32_Tuple,
|
||||
Add_Relu_Mul2_Clamp,
|
||||
ConvFwd1x1P0,
|
||||
8>{});
|
||||
add_device_operation_instances(instances,
|
||||
device_conv2d_int8_32Ds_instances<GK_GK_Tuple,
|
||||
I32_F32_Tuple,
|
||||
Add_Relu_Mul2_Clamp,
|
||||
ConvFwd1x1S1P0>{});
|
||||
device_grouped_conv2d_xdl_int8_instances<GK_GK_Tuple,
|
||||
I32_F32_Tuple,
|
||||
Add_Relu_Mul2_Clamp,
|
||||
ConvFwd1x1S1P0,
|
||||
8>{});
|
||||
}
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
@@ -0,0 +1,82 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "device_conv2d_xdl_int8_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
void add_device_conv2d_xdl_bias_perlayer_quantization_int8_instances(
|
||||
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
|
||||
GNHWC,
|
||||
GKYXC,
|
||||
GK_Tuple,
|
||||
GNHWK,
|
||||
int8_t,
|
||||
int8_t,
|
||||
I32_Tuple,
|
||||
int8_t,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Add_Mul_Clamp>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(instances,
|
||||
device_grouped_conv2d_xdl_int8_instances<GK_Tuple,
|
||||
I32_Tuple,
|
||||
Add_Mul_Clamp,
|
||||
ConvFwdDefault,
|
||||
8>{});
|
||||
add_device_operation_instances(instances,
|
||||
device_grouped_conv2d_xdl_int8_instances<GK_Tuple,
|
||||
I32_Tuple,
|
||||
Add_Mul_Clamp,
|
||||
ConvFwd1x1P0,
|
||||
8>{});
|
||||
add_device_operation_instances(instances,
|
||||
device_grouped_conv2d_xdl_int8_instances<GK_Tuple,
|
||||
I32_Tuple,
|
||||
Add_Mul_Clamp,
|
||||
ConvFwd1x1S1P0,
|
||||
8>{});
|
||||
}
|
||||
|
||||
void add_device_conv2d_xdl_bias_relu_perlayer_quantization_int8_instances(
|
||||
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
|
||||
GNHWC,
|
||||
GKYXC,
|
||||
GK_Tuple,
|
||||
GNHWK,
|
||||
int8_t,
|
||||
int8_t,
|
||||
I32_Tuple,
|
||||
int8_t,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Add_Relu_Mul_Clamp>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(instances,
|
||||
device_grouped_conv2d_xdl_int8_instances<GK_Tuple,
|
||||
I32_Tuple,
|
||||
Add_Relu_Mul_Clamp,
|
||||
ConvFwdDefault,
|
||||
8>{});
|
||||
|
||||
add_device_operation_instances(instances,
|
||||
device_grouped_conv2d_xdl_int8_instances<GK_Tuple,
|
||||
I32_Tuple,
|
||||
Add_Relu_Mul_Clamp,
|
||||
ConvFwd1x1P0,
|
||||
8>{});
|
||||
|
||||
add_device_operation_instances(instances,
|
||||
device_grouped_conv2d_xdl_int8_instances<GK_Tuple,
|
||||
I32_Tuple,
|
||||
Add_Relu_Mul_Clamp,
|
||||
ConvFwd1x1S1P0,
|
||||
8>{});
|
||||
}
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -0,0 +1,45 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "conv2d_quantization_common.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// clang-format off
|
||||
template <typename DsLayout,
|
||||
typename DsDatatype,
|
||||
typename OutElementOp,
|
||||
ConvolutionForwardSpecialization ConvSpec,
|
||||
index_t DstScalarPerVector>
|
||||
using device_grouped_conv2d_xdl_int8_instances =
|
||||
std::tuple <
|
||||
//########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
|
||||
//########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
|
||||
//########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
|
||||
//########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
|
||||
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial, GNHWC, GKYXC, DsLayout, GNHWK, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 256, 256, 128, 64, 16, 16, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 64, 1, 4>, DstScalarPerVector>,
|
||||
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial, GNHWC, GKYXC, DsLayout, GNHWK, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 256, 128, 256, 64, 16, 16, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 64, 1, 4>, DstScalarPerVector>,
|
||||
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial, GNHWC, GKYXC, DsLayout, GNHWK, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 128, 128, 128, 64, 16, 16, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 4>, DstScalarPerVector>,
|
||||
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial, GNHWC, GKYXC, DsLayout, GNHWK, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 256, 128, 128, 64, 16, 16, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 64, 1, 4>, DstScalarPerVector>,
|
||||
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial, GNHWC, GKYXC, DsLayout, GNHWK, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 128, 128, 64, 64, 16, 16, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 64, 1, 2>, DstScalarPerVector>,
|
||||
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial, GNHWC, GKYXC, DsLayout, GNHWK, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 128, 64, 128, 64, 16, 16, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 4>, DstScalarPerVector>,
|
||||
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial, GNHWC, GKYXC, DsLayout, GNHWK, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 64, 64, 64, 64, 16, 16, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 2>, DstScalarPerVector>,
|
||||
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial, GNHWC, GKYXC, DsLayout, GNHWK, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 256, 128, 64, 64, 16, 16, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 64, 1, 4>, DstScalarPerVector>,
|
||||
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial, GNHWC, GKYXC, DsLayout, GNHWK, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 256, 64, 128, 64, 16, 16, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 64, 1, 4>, DstScalarPerVector>,
|
||||
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial, GNHWC, GKYXC, DsLayout, GNHWK, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 128, 128, 32, 64, 16, 16, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 64, 1, 2>, DstScalarPerVector>,
|
||||
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial, GNHWC, GKYXC, DsLayout, GNHWK, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 128, 32, 128, 64, 16, 16, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 4>, DstScalarPerVector>,
|
||||
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial, GNHWC, GKYXC, DsLayout, GNHWK, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 64, 64, 32, 64, 16, 16, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 2>, DstScalarPerVector>,
|
||||
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<NDimSpatial, GNHWC, GKYXC, DsLayout, GNHWK, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 64, 32, 64, 64, 16, 16, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 2>, DstScalarPerVector>
|
||||
>;
|
||||
// clang-format on
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -0,0 +1,80 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "device_conv2d_xdl_int8_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
void add_device_conv2d_xdl_perchannel_quantization_int8_instances(
|
||||
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
|
||||
GNHWC,
|
||||
GKYXC,
|
||||
GK_Tuple,
|
||||
GNHWK,
|
||||
int8_t,
|
||||
int8_t,
|
||||
F32_Tuple,
|
||||
int8_t,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Mul2_Clamp>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(instances,
|
||||
device_grouped_conv2d_xdl_int8_instances<GK_Tuple,
|
||||
F32_Tuple,
|
||||
Mul2_Clamp,
|
||||
ConvFwdDefault,
|
||||
8>{});
|
||||
add_device_operation_instances(instances,
|
||||
device_grouped_conv2d_xdl_int8_instances<GK_Tuple,
|
||||
F32_Tuple,
|
||||
Mul2_Clamp,
|
||||
ConvFwd1x1P0,
|
||||
8>{});
|
||||
add_device_operation_instances(instances,
|
||||
device_grouped_conv2d_xdl_int8_instances<GK_Tuple,
|
||||
F32_Tuple,
|
||||
Mul2_Clamp,
|
||||
ConvFwd1x1S1P0,
|
||||
8>{});
|
||||
}
|
||||
|
||||
void add_device_conv2d_xdl_relu_perchannel_quantization_int8_instances(
|
||||
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
|
||||
GNHWC,
|
||||
GKYXC,
|
||||
GK_Tuple,
|
||||
GNHWK,
|
||||
int8_t,
|
||||
int8_t,
|
||||
F32_Tuple,
|
||||
int8_t,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Relu_Mul2_Clamp>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(instances,
|
||||
device_grouped_conv2d_xdl_int8_instances<GK_Tuple,
|
||||
F32_Tuple,
|
||||
Relu_Mul2_Clamp,
|
||||
ConvFwdDefault,
|
||||
8>{});
|
||||
add_device_operation_instances(instances,
|
||||
device_grouped_conv2d_xdl_int8_instances<GK_Tuple,
|
||||
F32_Tuple,
|
||||
Relu_Mul2_Clamp,
|
||||
ConvFwd1x1P0,
|
||||
8>{});
|
||||
add_device_operation_instances(instances,
|
||||
device_grouped_conv2d_xdl_int8_instances<GK_Tuple,
|
||||
F32_Tuple,
|
||||
Relu_Mul2_Clamp,
|
||||
ConvFwd1x1S1P0,
|
||||
8>{});
|
||||
}
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -0,0 +1,80 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "device_conv2d_xdl_int8_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
void add_device_conv2d_xdl_perlayer_quantization_int8_instances(
|
||||
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
|
||||
GNHWC,
|
||||
GKYXC,
|
||||
Empty_Tuple,
|
||||
GNHWK,
|
||||
int8_t,
|
||||
int8_t,
|
||||
Empty_Tuple,
|
||||
int8_t,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Mul_Clamp>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(instances,
|
||||
device_grouped_conv2d_xdl_int8_instances<Empty_Tuple,
|
||||
Empty_Tuple,
|
||||
Mul_Clamp,
|
||||
ConvFwdDefault,
|
||||
16>{});
|
||||
add_device_operation_instances(instances,
|
||||
device_grouped_conv2d_xdl_int8_instances<Empty_Tuple,
|
||||
Empty_Tuple,
|
||||
Mul_Clamp,
|
||||
ConvFwd1x1P0,
|
||||
16>{});
|
||||
add_device_operation_instances(instances,
|
||||
device_grouped_conv2d_xdl_int8_instances<Empty_Tuple,
|
||||
Empty_Tuple,
|
||||
Mul_Clamp,
|
||||
ConvFwd1x1S1P0,
|
||||
16>{});
|
||||
}
|
||||
|
||||
void add_device_conv2d_xdl_relu_perlayer_quantization_int8_instances(
|
||||
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
|
||||
GNHWC,
|
||||
GKYXC,
|
||||
Empty_Tuple,
|
||||
GNHWK,
|
||||
int8_t,
|
||||
int8_t,
|
||||
Empty_Tuple,
|
||||
int8_t,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Relu_Mul_Clamp>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(instances,
|
||||
device_grouped_conv2d_xdl_int8_instances<Empty_Tuple,
|
||||
Empty_Tuple,
|
||||
Relu_Mul_Clamp,
|
||||
ConvFwdDefault,
|
||||
16>{});
|
||||
add_device_operation_instances(instances,
|
||||
device_grouped_conv2d_xdl_int8_instances<Empty_Tuple,
|
||||
Empty_Tuple,
|
||||
Relu_Mul_Clamp,
|
||||
ConvFwd1x1P0,
|
||||
16>{});
|
||||
add_device_operation_instances(instances,
|
||||
device_grouped_conv2d_xdl_int8_instances<Empty_Tuple,
|
||||
Empty_Tuple,
|
||||
Relu_Mul_Clamp,
|
||||
ConvFwd1x1S1P0,
|
||||
16>{});
|
||||
}
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -1,68 +0,0 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "device_conv2d_xdl_int8_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
void add_device_conv2d_bias_perlayer_quantization_int8_instances(
|
||||
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
|
||||
GNHWC,
|
||||
GKYXC,
|
||||
GK_Tuple,
|
||||
GNHWK,
|
||||
int8_t,
|
||||
int8_t,
|
||||
I32_Tuple,
|
||||
int8_t,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Add_Mul_Clamp>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_conv2d_int8_32Ds_instances<GK_Tuple, I32_Tuple, Add_Mul_Clamp, ConvFwdDefault>{});
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_conv2d_int8_32Ds_instances<GK_Tuple, I32_Tuple, Add_Mul_Clamp, ConvFwd1x1P0>{});
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_conv2d_int8_32Ds_instances<GK_Tuple, I32_Tuple, Add_Mul_Clamp, ConvFwd1x1S1P0>{});
|
||||
}
|
||||
|
||||
void add_device_conv2d_bias_relu_perlayer_quantization_int8_instances(
|
||||
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
|
||||
GNHWC,
|
||||
GKYXC,
|
||||
GK_Tuple,
|
||||
GNHWK,
|
||||
int8_t,
|
||||
int8_t,
|
||||
I32_Tuple,
|
||||
int8_t,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Add_Relu_Mul_Clamp>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(instances,
|
||||
device_conv2d_int8_32Ds_instances<GK_Tuple,
|
||||
I32_Tuple,
|
||||
Add_Relu_Mul_Clamp,
|
||||
ConvFwdDefault>{});
|
||||
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_conv2d_int8_32Ds_instances<GK_Tuple, I32_Tuple, Add_Relu_Mul_Clamp, ConvFwd1x1P0>{});
|
||||
|
||||
add_device_operation_instances(instances,
|
||||
device_conv2d_int8_32Ds_instances<GK_Tuple,
|
||||
I32_Tuple,
|
||||
Add_Relu_Mul_Clamp,
|
||||
ConvFwd1x1S1P0>{});
|
||||
}
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -1,111 +0,0 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
using Empty_Tuple = ck::Tuple<>;
|
||||
template <ck::index_t... Is>
|
||||
using S = ck::Sequence<Is...>;
|
||||
|
||||
using GNHWC = ck::tensor_layout::convolution::GNHWC;
|
||||
using GKYXC = ck::tensor_layout::convolution::GKYXC;
|
||||
using GNHWK = ck::tensor_layout::convolution::GNHWK;
|
||||
using GK = ck::tensor_layout::convolution::G_K;
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
using Relu = ck::tensor_operation::element_wise::Relu;
|
||||
|
||||
using GK_Tuple = ck::Tuple<GK>;
|
||||
using GK_GK_Tuple = ck::Tuple<GK, GK>;
|
||||
using I32_Tuple = ck::Tuple<int32_t>;
|
||||
using F32_Tuple = ck::Tuple<float>;
|
||||
using I32_F32_Tuple = ck::Tuple<int32_t, float>;
|
||||
|
||||
using Mul_Clamp = ck::tensor_operation::element_wise::Activation_Mul_Clamp<PassThrough>;
|
||||
using Relu_Mul_Clamp = ck::tensor_operation::element_wise::Activation_Mul_Clamp<Relu>;
|
||||
|
||||
using Add_Mul_Clamp = ck::tensor_operation::element_wise::Add_Activation_Mul_Clamp<PassThrough>;
|
||||
using Add_Relu_Mul_Clamp = ck::tensor_operation::element_wise::Add_Activation_Mul_Clamp<Relu>;
|
||||
|
||||
using Mul2_Clamp = ck::tensor_operation::element_wise::Activation_Mul2_Clamp<PassThrough>;
|
||||
using Relu_Mul2_Clamp = ck::tensor_operation::element_wise::Activation_Mul2_Clamp<Relu>;
|
||||
|
||||
using Add_Mul2_Clamp = ck::tensor_operation::element_wise::Add_Activation_Mul2_Clamp<PassThrough>;
|
||||
using Add_Relu_Mul2_Clamp = ck::tensor_operation::element_wise::Add_Activation_Mul2_Clamp<Relu>;
|
||||
|
||||
static constexpr ck::index_t NDimSpatial = 2;
|
||||
static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
|
||||
static constexpr auto ConvFwdDefault =
|
||||
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
|
||||
static constexpr auto ConvFwd1x1P0 =
|
||||
ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Pad0;
|
||||
static constexpr auto ConvFwd1x1S1P0 =
|
||||
ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0;
|
||||
|
||||
template <typename DsLayout,
|
||||
typename DsDatatype,
|
||||
typename OutElementOp,
|
||||
ConvolutionForwardSpecialization ConvSpec>
|
||||
// clang-format off
|
||||
using device_conv2d_int8_instances =
|
||||
std::tuple <
|
||||
//########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
|
||||
//########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
|
||||
//########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
|
||||
//########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
|
||||
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, DsLayout, GNHWK, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 256, 256, 128, 64, 16, 16, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 64, 1, 4>, 16>,
|
||||
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, DsLayout, GNHWK, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 256, 128, 256, 64, 16, 16, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 64, 1, 4>, 16>,
|
||||
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, DsLayout, GNHWK, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 128, 128, 128, 64, 16, 16, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 4>, 16>,
|
||||
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, DsLayout, GNHWK, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 256, 128, 128, 64, 16, 16, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 64, 1, 4>, 16>,
|
||||
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, DsLayout, GNHWK, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 128, 128, 64, 64, 16, 16, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 64, 1, 2>, 16>,
|
||||
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, DsLayout, GNHWK, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 128, 64, 128, 64, 16, 16, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 4>, 16>,
|
||||
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, DsLayout, GNHWK, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 64, 64, 64, 64, 16, 16, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 2>, 16>,
|
||||
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, DsLayout, GNHWK, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 256, 128, 64, 64, 16, 16, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 64, 1, 4>, 16>,
|
||||
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, DsLayout, GNHWK, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 256, 64, 128, 64, 16, 16, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 64, 1, 4>, 16>,
|
||||
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, DsLayout, GNHWK, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 128, 128, 32, 64, 16, 16, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 64, 1, 2>, 16>,
|
||||
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, DsLayout, GNHWK, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 128, 32, 128, 64, 16, 16, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 4>, 16>,
|
||||
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, DsLayout, GNHWK, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 64, 64, 32, 64, 16, 16, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 2>, 16>,
|
||||
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, DsLayout, GNHWK, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 64, 32, 64, 64, 16, 16, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 2>, 16>
|
||||
>;
|
||||
// clang-format on
|
||||
|
||||
// for conv + multiple of 32 bit Ds. bit of Ds will affect the ScalarPerVector of C
|
||||
template <typename DsLayout,
|
||||
typename DsDatatype,
|
||||
typename OutElementOp,
|
||||
ConvolutionForwardSpecialization ConvSpec>
|
||||
// clang-format off
|
||||
using device_conv2d_int8_32Ds_instances =
|
||||
std::tuple <
|
||||
//########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
|
||||
//########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
|
||||
//########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
|
||||
//########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
|
||||
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, DsLayout, GNHWK, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 256, 256, 128, 64, 16, 16, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 64, 1, 4>, 8>,
|
||||
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, DsLayout, GNHWK, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 256, 128, 256, 64, 16, 16, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 64, 1, 4>, 8>,
|
||||
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, DsLayout, GNHWK, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 128, 128, 128, 64, 16, 16, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 4>, 8>,
|
||||
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, DsLayout, GNHWK, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 256, 128, 128, 64, 16, 16, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 64, 1, 4>, 8>,
|
||||
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, DsLayout, GNHWK, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 128, 128, 64, 64, 16, 16, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 64, 1, 2>, 8>,
|
||||
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, DsLayout, GNHWK, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 128, 64, 128, 64, 16, 16, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 4>, 8>,
|
||||
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, DsLayout, GNHWK, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 64, 64, 64, 64, 16, 16, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 2>, 8>,
|
||||
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, DsLayout, GNHWK, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 256, 128, 64, 64, 16, 16, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 64, 1, 4>, 8>,
|
||||
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, DsLayout, GNHWK, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 256, 64, 128, 64, 16, 16, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 64, 1, 4>, 8>,
|
||||
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, DsLayout, GNHWK, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 128, 128, 32, 64, 16, 16, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 64, 1, 2>, 8>,
|
||||
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, DsLayout, GNHWK, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 128, 32, 128, 64, 16, 16, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 4>, 8>,
|
||||
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, DsLayout, GNHWK, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 64, 64, 32, 64, 16, 16, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 2>, 8>,
|
||||
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, DsLayout, GNHWK, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 64, 32, 64, 64, 16, 16, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 2>, 8>
|
||||
>;
|
||||
// clang-format on
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -1,62 +0,0 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "device_conv2d_xdl_int8_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
void add_device_conv2d_perchannel_quantization_int8_instances(
|
||||
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
|
||||
GNHWC,
|
||||
GKYXC,
|
||||
GK_Tuple,
|
||||
GNHWK,
|
||||
int8_t,
|
||||
int8_t,
|
||||
F32_Tuple,
|
||||
int8_t,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Mul2_Clamp>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_conv2d_int8_32Ds_instances<GK_Tuple, F32_Tuple, Mul2_Clamp, ConvFwdDefault>{});
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_conv2d_int8_32Ds_instances<GK_Tuple, F32_Tuple, Mul2_Clamp, ConvFwd1x1P0>{});
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_conv2d_int8_32Ds_instances<GK_Tuple, F32_Tuple, Mul2_Clamp, ConvFwd1x1S1P0>{});
|
||||
}
|
||||
|
||||
void add_device_conv2d_relu_perchannel_quantization_int8_instances(
|
||||
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
|
||||
GNHWC,
|
||||
GKYXC,
|
||||
GK_Tuple,
|
||||
GNHWK,
|
||||
int8_t,
|
||||
int8_t,
|
||||
F32_Tuple,
|
||||
int8_t,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Relu_Mul2_Clamp>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_conv2d_int8_32Ds_instances<GK_Tuple, F32_Tuple, Relu_Mul2_Clamp, ConvFwdDefault>{});
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_conv2d_int8_32Ds_instances<GK_Tuple, F32_Tuple, Relu_Mul2_Clamp, ConvFwd1x1P0>{});
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_conv2d_int8_32Ds_instances<GK_Tuple, F32_Tuple, Relu_Mul2_Clamp, ConvFwd1x1S1P0>{});
|
||||
}
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -1,62 +0,0 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "device_conv2d_xdl_int8_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
void add_device_conv2d_perlayer_quantization_int8_instances(
|
||||
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
|
||||
GNHWC,
|
||||
GKYXC,
|
||||
Empty_Tuple,
|
||||
GNHWK,
|
||||
int8_t,
|
||||
int8_t,
|
||||
Empty_Tuple,
|
||||
int8_t,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Mul_Clamp>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_conv2d_int8_instances<Empty_Tuple, Empty_Tuple, Mul_Clamp, ConvFwdDefault>{});
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_conv2d_int8_instances<Empty_Tuple, Empty_Tuple, Mul_Clamp, ConvFwd1x1P0>{});
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_conv2d_int8_instances<Empty_Tuple, Empty_Tuple, Mul_Clamp, ConvFwd1x1S1P0>{});
|
||||
}
|
||||
|
||||
void add_device_conv2d_relu_perlayer_quantization_int8_instances(
|
||||
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<NDimSpatial,
|
||||
GNHWC,
|
||||
GKYXC,
|
||||
Empty_Tuple,
|
||||
GNHWK,
|
||||
int8_t,
|
||||
int8_t,
|
||||
Empty_Tuple,
|
||||
int8_t,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Relu_Mul_Clamp>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_conv2d_int8_instances<Empty_Tuple, Empty_Tuple, Relu_Mul_Clamp, ConvFwdDefault>{});
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_conv2d_int8_instances<Empty_Tuple, Empty_Tuple, Relu_Mul_Clamp, ConvFwd1x1P0>{});
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_conv2d_int8_instances<Empty_Tuple, Empty_Tuple, Relu_Mul_Clamp, ConvFwd1x1S1P0>{});
|
||||
}
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -0,0 +1,59 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "gemm_quantization_common.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_dl.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
template <typename OutElementOp>
|
||||
using device_gemm_quantization_dl_c_shuffle_i8_i8_i8_km_kn_mn_instances = std::tuple<
|
||||
// clang-format off
|
||||
//####################| A| B| Ds| E| AData| BData| AccData| DsData| EData| A| B| CDE| GEMM| Block| MPer| NPer| KPer| K1| M1Per| N1Per| KPer| M11N11Thread| M11N11Thread| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| CThreadTransfer| CThreadTransfer| CThreadTransfer|
|
||||
//####################| Layout| Layout| Layout| Layout| Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | ThreadM| Thread| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| SrcDstAccess| SrcDstVectorDim| DstScalarPerVector|
|
||||
//####################| | | | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | K0_M0_M1_K1| K0_M0_M1_K1| ArrangeOrder| Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1| K0_N0_N1_K1| K0_N0_N1_K1| ArrangeOrder| Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1| Order| | |
|
||||
//####################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
|
||||
DeviceGemmMultipleD_Dl< Col, Row, Empty_Tuple, Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, OutElementOp, MNKPadding, 256, 128, 128, 16, 4, 4, 4, 1, S<8, 2>, S<8, 2>, S<2, 1, 4, 4>, S<8, 1, 32, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 4>, S<2, 1, 4, 4>, S<8, 1, 32, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>, 5, 4>
|
||||
// clang-format on
|
||||
>;
|
||||
|
||||
template <typename OutElementOp>
|
||||
using device_gemm_quantization_dl_c_shuffle_i8_i8_i8_km_nk_mn_instances = std::tuple<
|
||||
// clang-format off
|
||||
//####################| A| B| Ds| E| AData| BData| AccData| DsData| EData| A| B| CDE| GEMM| Block| MPer| NPer| KPer| K1| M1Per| N1Per| KPer| M11N11Thread| M11N11Thread| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| CThreadTransfer| CThreadTransfer| CThreadTransfer|
|
||||
//####################| Layout| Layout| Layout| Layout| Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | ThreadM| Thread| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| SrcDstAccess| SrcDstVectorDim| DstScalarPerVector|
|
||||
//####################| | | | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | K0_M0_M1_K1| K0_M0_M1_K1| ArrangeOrder| Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1| K0_N0_N1_K1| K0_N0_N1_K1| ArrangeOrder| Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1| Order| | |
|
||||
//####################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
|
||||
DeviceGemmMultipleD_Dl< Col, Col, Empty_Tuple, Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, OutElementOp, MNKPadding, 256, 128, 128, 16, 4, 4, 4, 1, S<8, 2>, S<8, 2>, S<2, 1, 4, 4>, S<8, 1, 32, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 4>, S<8, 1, 1, 4>, S<2, 1, 128, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 4>, S<1, 2, 0, 3>, S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>, 5, 4>
|
||||
// clang-format on
|
||||
>;
|
||||
|
||||
template <typename OutElementOp>
|
||||
using device_gemm_quantization_dl_c_shuffle_i8_i8_i8_mk_kn_mn_instances = std::tuple<
|
||||
// clang-format off
|
||||
//####################| A| B| Ds| E| AData| BData| AccData| DsData| EData| A| B| CDE| GEMM| Block| MPer| NPer| KPer| K1| M1Per| N1Per| KPer| M11N11Thread| M11N11Thread| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| CThreadTransfer| CThreadTransfer| CThreadTransfer|
|
||||
//####################| Layout| Layout| Layout| Layout| Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | ThreadM| Thread| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| SrcDstAccess| SrcDstVectorDim| DstScalarPerVector|
|
||||
//####################| | | | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | K0_M0_M1_K1| K0_M0_M1_K1| ArrangeOrder| Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1| K0_N0_N1_K1| K0_N0_N1_K1| ArrangeOrder| Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1| Order| | |
|
||||
//####################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
|
||||
DeviceGemmMultipleD_Dl< Row, Row, Empty_Tuple, Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, OutElementOp, MNKPadding, 256, 128, 128, 16, 4, 4, 4, 1, S<8, 2>, S<8, 2>, S<8, 1, 1, 4>, S<2, 1, 128, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 4>, S<1, 2, 0, 3>, S<1, 1, 1, 4>, S<2, 1, 4, 4>, S<8, 1, 32, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>, 5, 4>
|
||||
// clang-format on
|
||||
>;
|
||||
|
||||
template <typename OutElementOp>
|
||||
using device_gemm_quantization_dl_c_shuffle_i8_i8_i8_mk_nk_mn_instances = std::tuple<
|
||||
// clang-format off
|
||||
//####################| A| B| Ds| E| AData| BData| AccData| DsData| EData| A| B| CDE| GEMM| Block| MPer| NPer| KPer| K1| M1Per| N1Per| KPer| M11N11Thread| M11N11Thread| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| CThreadTransfer| CThreadTransfer| CThreadTransfer|
|
||||
//####################| Layout| Layout| Layout| Layout| Type| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | ThreadM| Thread| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| SrcDstAccess| SrcDstVectorDim| DstScalarPerVector|
|
||||
//####################| | | | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | K0_M0_M1_K1| K0_M0_M1_K1| ArrangeOrder| Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1| K0_N0_N1_K1| K0_N0_N1_K1| ArrangeOrder| Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1| Order| | |
|
||||
//####################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
|
||||
DeviceGemmMultipleD_Dl< Row, Col, Empty_Tuple, Row, int8_t, int8_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, OutElementOp, MNKPadding, 256, 128, 128, 16, 4, 4, 4, 1, S<8, 2>, S<8, 2>, S<8, 1, 1, 4>, S<2, 1, 128, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 4>, S<1, 2, 0, 3>, S<1, 1, 1, 4>, S<8, 1, 1, 4>, S<2, 1, 128, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 4>, S<1, 2, 0, 3>, S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>, 5, 4>
|
||||
// clang-format on
|
||||
>;
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -0,0 +1,32 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "device_gemm_quantization_dl_c_shuffle_i8_i8_i8_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// Layout(A, B, C) = [Col, Row, Row]
|
||||
void add_device_gemm_quantization_dl_c_shuffle_i8_i8_i8_km_kn_mn_instances(
|
||||
std::vector<std::unique_ptr<DeviceGemmMultipleD<Col,
|
||||
Row,
|
||||
Empty_Tuple,
|
||||
Row,
|
||||
int8_t,
|
||||
int8_t,
|
||||
Empty_Tuple,
|
||||
int8_t,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Mul_Clamp>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances, device_gemm_quantization_dl_c_shuffle_i8_i8_i8_km_kn_mn_instances<Mul_Clamp>{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -0,0 +1,32 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "device_gemm_quantization_dl_c_shuffle_i8_i8_i8_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// Layout(A, B, C) = [Col, Col, Row]
|
||||
void add_device_gemm_quantization_dl_c_shuffle_i8_i8_i8_km_nk_mn_instances(
|
||||
std::vector<std::unique_ptr<DeviceGemmMultipleD<Col,
|
||||
Col,
|
||||
Empty_Tuple,
|
||||
Row,
|
||||
int8_t,
|
||||
int8_t,
|
||||
Empty_Tuple,
|
||||
int8_t,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Mul_Clamp>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances, device_gemm_quantization_dl_c_shuffle_i8_i8_i8_km_nk_mn_instances<Mul_Clamp>{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -0,0 +1,32 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "device_gemm_quantization_dl_c_shuffle_i8_i8_i8_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// Layout(A, B, C) = [Row, Row, Row]
|
||||
void add_device_gemm_quantization_dl_c_shuffle_i8_i8_i8_mk_kn_mn_instances(
|
||||
std::vector<std::unique_ptr<DeviceGemmMultipleD<Row,
|
||||
Row,
|
||||
Empty_Tuple,
|
||||
Row,
|
||||
int8_t,
|
||||
int8_t,
|
||||
Empty_Tuple,
|
||||
int8_t,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Mul_Clamp>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances, device_gemm_quantization_dl_c_shuffle_i8_i8_i8_mk_kn_mn_instances<Mul_Clamp>{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -0,0 +1,32 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "device_gemm_quantization_dl_c_shuffle_i8_i8_i8_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// Layout(A, B, C) = [Row, Col, Row]
|
||||
void add_device_gemm_quantization_dl_c_shuffle_i8_i8_i8_mk_nk_mn_instances(
|
||||
std::vector<std::unique_ptr<DeviceGemmMultipleD<Row,
|
||||
Col,
|
||||
Empty_Tuple,
|
||||
Row,
|
||||
int8_t,
|
||||
int8_t,
|
||||
Empty_Tuple,
|
||||
int8_t,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Mul_Clamp>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances, device_gemm_quantization_dl_c_shuffle_i8_i8_i8_mk_nk_mn_instances<Mul_Clamp>{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -0,0 +1,116 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "gemm_quantization_common.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
template <typename OutElementOp, LoopScheduler GemmLoopScheduler, PipelineVersion GemmPipeline>
|
||||
using device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instances = std::tuple<
|
||||
// clang-format off
|
||||
//##############################| A| B| Ds| E| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| LoopScheduler| Pipeline|
|
||||
//##############################| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| | |
|
||||
//##############################| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| | |
|
||||
//##############################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
|
||||
DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Empty_Tuple, Row, int8_t, int8_t, int32_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, OutElementOp, MNKPadding, 1, 256, 256, 128, 64, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 64, 1, 4>, 16, GemmLoopScheduler, GemmPipeline>,
|
||||
DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Empty_Tuple, Row, int8_t, int8_t, int32_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, OutElementOp, MNKPadding, 1, 256, 256, 128, 64, 16, 16, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 16, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, 1, 1, S<1, 64, 1, 4>, 16, GemmLoopScheduler, GemmPipeline>,
|
||||
DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Empty_Tuple, Row, int8_t, int8_t, int32_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, OutElementOp, MNKPadding, 1, 256, 128, 256, 64, 4, 4, 32, 32, 2, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 64, 1, 4>, 16, GemmLoopScheduler, GemmPipeline>,
|
||||
DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Empty_Tuple, Row, int8_t, int8_t, int32_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, OutElementOp, MNKPadding, 1, 256, 128, 256, 64, 16, 16, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 16, 1, 1, 1, S<1, 64, 1, 4>, 16, GemmLoopScheduler, GemmPipeline>,
|
||||
DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Empty_Tuple, Row, int8_t, int8_t, int32_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, OutElementOp, MNKPadding, 1, 128, 128, 128, 64, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 32, 1, 4>, 16, GemmLoopScheduler, GemmPipeline>,
|
||||
DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Empty_Tuple, Row, int8_t, int8_t, int32_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, OutElementOp, MNKPadding, 1, 128, 128, 128, 64, 16, 16, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 16, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 16, 1, 1, 1, S<1, 32, 1, 4>, 16, GemmLoopScheduler, GemmPipeline>,
|
||||
DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Empty_Tuple, Row, int8_t, int8_t, int32_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, OutElementOp, MNKPadding, 1, 256, 128, 128, 64, 4, 4, 32, 32, 2, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 64, 1, 4>, 16, GemmLoopScheduler, GemmPipeline>,
|
||||
DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Empty_Tuple, Row, int8_t, int8_t, int32_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, OutElementOp, MNKPadding, 1, 256, 128, 128, 64, 16, 16, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, 1, 1, S<1, 64, 1, 4>, 16, GemmLoopScheduler, GemmPipeline>,
|
||||
DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Empty_Tuple, Row, int8_t, int8_t, int32_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, OutElementOp, MNKPadding, 1, 128, 128, 64, 64, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 64, 1, 2>, 16, GemmLoopScheduler, GemmPipeline>,
|
||||
DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Empty_Tuple, Row, int8_t, int8_t, int32_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, OutElementOp, MNKPadding, 1, 128, 128, 64, 64, 16, 16, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 16, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, 1, 1, S<1, 64, 1, 2>, 16, GemmLoopScheduler, GemmPipeline>,
|
||||
DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Empty_Tuple, Row, int8_t, int8_t, int32_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, OutElementOp, MNKPadding, 1, 128, 64, 128, 64, 4, 4, 32, 32, 2, 2, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 32, 1, 4>, 16, GemmLoopScheduler, GemmPipeline>,
|
||||
DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Empty_Tuple, Row, int8_t, int8_t, int32_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, OutElementOp, MNKPadding, 1, 128, 64, 128, 64, 16, 16, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 16, 1, 1, 1, S<1, 32, 1, 4>, 16, GemmLoopScheduler, GemmPipeline>,
|
||||
DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Empty_Tuple, Row, int8_t, int8_t, int32_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, OutElementOp, MNKPadding, 1, 256, 128, 64, 64, 4, 4, 32, 32, 2, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 64, 1, 4>, 16, GemmLoopScheduler, GemmPipeline>,
|
||||
DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Empty_Tuple, Row, int8_t, int8_t, int32_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, OutElementOp, MNKPadding, 1, 256, 128, 64, 64, 16, 16, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 16, 1, 1, 1, S<1, 64, 1, 4>, 16, GemmLoopScheduler, GemmPipeline>,
|
||||
DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Empty_Tuple, Row, int8_t, int8_t, int32_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, OutElementOp, MNKPadding, 1, 256, 64, 128, 64, 4, 4, 32, 32, 1, 2, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 64, 1, 4>, 16, GemmLoopScheduler, GemmPipeline>,
|
||||
DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Empty_Tuple, Row, int8_t, int8_t, int32_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, OutElementOp, MNKPadding, 1, 256, 64, 128, 64, 16, 16, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 16, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, 1, 1, S<1, 64, 1, 4>, 16, GemmLoopScheduler, GemmPipeline>
|
||||
// clang-format on
|
||||
>;
|
||||
|
||||
template <typename OutElementOp, LoopScheduler GemmLoopScheduler, PipelineVersion GemmPipeline>
|
||||
using device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instances = std::tuple<
|
||||
// clang-format off
|
||||
//##############################| A| B| Ds| E| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| LoopScheduler| Pipeline|
|
||||
//##############################| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| | |
|
||||
//##############################| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| | |
|
||||
//##############################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
|
||||
DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Empty_Tuple, Row, int8_t, int8_t, int32_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, OutElementOp, MNKPadding, 1, 256, 256, 128, 64, 4, 16, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 64, 1, 4>, 16, GemmLoopScheduler, GemmPipeline>,
|
||||
DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Empty_Tuple, Row, int8_t, int8_t, int32_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, OutElementOp, MNKPadding, 1, 256, 256, 128, 64, 16, 16, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 64, 1, 4>, 16, GemmLoopScheduler, GemmPipeline>,
|
||||
DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Empty_Tuple, Row, int8_t, int8_t, int32_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, OutElementOp, MNKPadding, 1, 256, 128, 256, 64, 4, 16, 32, 32, 2, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 64, 1, 4>, 16, GemmLoopScheduler, GemmPipeline>,
|
||||
DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Empty_Tuple, Row, int8_t, int8_t, int32_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, OutElementOp, MNKPadding, 1, 256, 128, 256, 64, 16, 16, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 64, 1, 4>, 16, GemmLoopScheduler, GemmPipeline>,
|
||||
DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Empty_Tuple, Row, int8_t, int8_t, int32_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, OutElementOp, MNKPadding, 1, 128, 128, 128, 64, 4, 16, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 4>, 16, GemmLoopScheduler, GemmPipeline>,
|
||||
DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Empty_Tuple, Row, int8_t, int8_t, int32_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, OutElementOp, MNKPadding, 1, 128, 128, 128, 64, 16, 16, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 16, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 4>, 16, GemmLoopScheduler, GemmPipeline>,
|
||||
DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Empty_Tuple, Row, int8_t, int8_t, int32_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, OutElementOp, MNKPadding, 1, 256, 128, 128, 64, 4, 16, 32, 32, 2, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 64, 1, 4>, 16, GemmLoopScheduler, GemmPipeline>,
|
||||
DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Empty_Tuple, Row, int8_t, int8_t, int32_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, OutElementOp, MNKPadding, 1, 256, 128, 128, 64, 16, 16, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 64, 1, 4>, 16, GemmLoopScheduler, GemmPipeline>,
|
||||
DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Empty_Tuple, Row, int8_t, int8_t, int32_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, OutElementOp, MNKPadding, 1, 128, 128, 64, 64, 4, 16, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 64, 1, 2>, 16, GemmLoopScheduler, GemmPipeline>,
|
||||
DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Empty_Tuple, Row, int8_t, int8_t, int32_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, OutElementOp, MNKPadding, 1, 128, 128, 64, 64, 16, 16, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 16, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 64, 1, 2>, 16, GemmLoopScheduler, GemmPipeline>,
|
||||
DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Empty_Tuple, Row, int8_t, int8_t, int32_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, OutElementOp, MNKPadding, 1, 128, 64, 128, 64, 4, 16, 32, 32, 2, 2, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 4>, 16, GemmLoopScheduler, GemmPipeline>,
|
||||
DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Empty_Tuple, Row, int8_t, int8_t, int32_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, OutElementOp, MNKPadding, 1, 128, 64, 128, 64, 16, 16, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 4>, 16, GemmLoopScheduler, GemmPipeline>,
|
||||
DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Empty_Tuple, Row, int8_t, int8_t, int32_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, OutElementOp, MNKPadding, 1, 256, 128, 64, 64, 4, 16, 32, 32, 2, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 64, 1, 4>, 16, GemmLoopScheduler, GemmPipeline>,
|
||||
DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Empty_Tuple, Row, int8_t, int8_t, int32_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, OutElementOp, MNKPadding, 1, 256, 128, 64, 64, 16, 16, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 64, 1, 4>, 16, GemmLoopScheduler, GemmPipeline>,
|
||||
DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Empty_Tuple, Row, int8_t, int8_t, int32_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, OutElementOp, MNKPadding, 1, 256, 64, 128, 64, 4, 16, 32, 32, 1, 2, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 64, 1, 4>, 16, GemmLoopScheduler, GemmPipeline>,
|
||||
DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Empty_Tuple, Row, int8_t, int8_t, int32_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, OutElementOp, MNKPadding, 1, 256, 64, 128, 64, 16, 16, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 64, 1, 4>, 16, GemmLoopScheduler, GemmPipeline>
|
||||
// clang-format on
|
||||
>;
|
||||
|
||||
template <typename OutElementOp, LoopScheduler GemmLoopScheduler, PipelineVersion GemmPipeline>
|
||||
using device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instances = std::tuple<
|
||||
// clang-format off
|
||||
//##############################| A| B| Ds| E| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| LoopScheduler| Pipeline|
|
||||
//##############################| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| | |
|
||||
//##############################| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| | |
|
||||
//##############################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
|
||||
DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Empty_Tuple, Row, int8_t, int8_t, int32_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, OutElementOp, MNKPadding, 1, 256, 256, 128, 64, 16, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 64, 1, 4>, 16, GemmLoopScheduler, GemmPipeline>,
|
||||
DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Empty_Tuple, Row, int8_t, int8_t, int32_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, OutElementOp, MNKPadding, 1, 256, 256, 128, 64, 16, 16, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, 1, 1, S<1, 64, 1, 4>, 16, GemmLoopScheduler, GemmPipeline>,
|
||||
DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Empty_Tuple, Row, int8_t, int8_t, int32_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, OutElementOp, MNKPadding, 1, 256, 128, 256, 64, 16, 4, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 64, 1, 4>, 16, GemmLoopScheduler, GemmPipeline>,
|
||||
DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Empty_Tuple, Row, int8_t, int8_t, int32_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, OutElementOp, MNKPadding, 1, 256, 128, 256, 64, 16, 16, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 16, 1, 1, 1, S<1, 64, 1, 4>, 16, GemmLoopScheduler, GemmPipeline>,
|
||||
DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Empty_Tuple, Row, int8_t, int8_t, int32_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, OutElementOp, MNKPadding, 1, 128, 128, 128, 64, 16, 4, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 32, 1, 4>, 16, GemmLoopScheduler, GemmPipeline>,
|
||||
DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Empty_Tuple, Row, int8_t, int8_t, int32_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, OutElementOp, MNKPadding, 1, 128, 128, 128, 64, 16, 16, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 16, 1, 1, 1, S<1, 32, 1, 4>, 16, GemmLoopScheduler, GemmPipeline>,
|
||||
DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Empty_Tuple, Row, int8_t, int8_t, int32_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, OutElementOp, MNKPadding, 1, 256, 128, 128, 64, 16, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 64, 1, 4>, 16, GemmLoopScheduler, GemmPipeline>,
|
||||
DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Empty_Tuple, Row, int8_t, int8_t, int32_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, OutElementOp, MNKPadding, 1, 256, 128, 128, 64, 16, 16, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, 1, 1, S<1, 64, 1, 4>, 16, GemmLoopScheduler, GemmPipeline>,
|
||||
DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Empty_Tuple, Row, int8_t, int8_t, int32_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, OutElementOp, MNKPadding, 1, 128, 128, 64, 64, 16, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 64, 1, 2>, 16, GemmLoopScheduler, GemmPipeline>,
|
||||
DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Empty_Tuple, Row, int8_t, int8_t, int32_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, OutElementOp, MNKPadding, 1, 128, 128, 64, 64, 16, 16, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, 1, 1, S<1, 64, 1, 2>, 16, GemmLoopScheduler, GemmPipeline>,
|
||||
DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Empty_Tuple, Row, int8_t, int8_t, int32_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, OutElementOp, MNKPadding, 1, 128, 64, 128, 64, 16, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 32, 1, 4>, 16, GemmLoopScheduler, GemmPipeline>,
|
||||
DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Empty_Tuple, Row, int8_t, int8_t, int32_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, OutElementOp, MNKPadding, 1, 128, 64, 128, 64, 16, 16, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 16, 1, 1, 1, S<1, 32, 1, 4>, 16, GemmLoopScheduler, GemmPipeline>,
|
||||
DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Empty_Tuple, Row, int8_t, int8_t, int32_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, OutElementOp, MNKPadding, 1, 256, 128, 64, 64, 16, 4, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 64, 1, 4>, 16, GemmLoopScheduler, GemmPipeline>,
|
||||
DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Empty_Tuple, Row, int8_t, int8_t, int32_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, OutElementOp, MNKPadding, 1, 256, 128, 64, 64, 16, 16, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 16, 1, 1, 1, S<1, 64, 1, 4>, 16, GemmLoopScheduler, GemmPipeline>,
|
||||
DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Empty_Tuple, Row, int8_t, int8_t, int32_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, OutElementOp, MNKPadding, 1, 256, 64, 128, 64, 16, 4, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 64, 1, 4>, 16, GemmLoopScheduler, GemmPipeline>,
|
||||
DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Empty_Tuple, Row, int8_t, int8_t, int32_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, OutElementOp, MNKPadding, 1, 256, 64, 128, 64, 16, 16, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, 1, 1, S<1, 64, 1, 4>, 16, GemmLoopScheduler, GemmPipeline>
|
||||
// clang-format on
|
||||
>;
|
||||
|
||||
template <typename OutElementOp, LoopScheduler GemmLoopScheduler, PipelineVersion GemmPipeline>
|
||||
using device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instances = std::tuple<
|
||||
// clang-format off
|
||||
//##############################| A| B| Ds| E| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| LoopScheduler| Pipeline|
|
||||
//##############################| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| | |
|
||||
//##############################| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| | |
|
||||
//##############################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
|
||||
DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Empty_Tuple, Row, int8_t, int8_t, int32_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, OutElementOp, MNKPadding, 1, 256, 256, 128, 64, 16, 16, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 16, GemmLoopScheduler, GemmPipeline>,
|
||||
DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Empty_Tuple, Row, int8_t, int8_t, int32_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, OutElementOp, MNKPadding, 1, 256, 128, 256, 64, 16, 16, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 16, GemmLoopScheduler, GemmPipeline>,
|
||||
DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Empty_Tuple, Row, int8_t, int8_t, int32_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, OutElementOp, MNKPadding, 1, 128, 128, 128, 64, 16, 16, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 16, GemmLoopScheduler, GemmPipeline>,
|
||||
DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Empty_Tuple, Row, int8_t, int8_t, int32_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, OutElementOp, MNKPadding, 1, 256, 128, 128, 64, 16, 16, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 16, GemmLoopScheduler, GemmPipeline>,
|
||||
DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Empty_Tuple, Row, int8_t, int8_t, int32_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, OutElementOp, MNKPadding, 1, 128, 128, 64, 64, 16, 16, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 2>, 16, GemmLoopScheduler, GemmPipeline>,
|
||||
DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Empty_Tuple, Row, int8_t, int8_t, int32_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, OutElementOp, MNKPadding, 1, 128, 64, 128, 64, 16, 16, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 16, GemmLoopScheduler, GemmPipeline>,
|
||||
DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Empty_Tuple, Row, int8_t, int8_t, int32_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, OutElementOp, MNKPadding, 1, 64, 64, 64, 64, 16, 16, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 2>, 16, GemmLoopScheduler, GemmPipeline>,
|
||||
DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Empty_Tuple, Row, int8_t, int8_t, int32_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, OutElementOp, MNKPadding, 1, 256, 128, 64, 64, 16, 16, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 16, GemmLoopScheduler, GemmPipeline>,
|
||||
DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Empty_Tuple, Row, int8_t, int8_t, int32_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, OutElementOp, MNKPadding, 1, 256, 64, 128, 64, 16, 16, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 16, GemmLoopScheduler, GemmPipeline>,
|
||||
DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Empty_Tuple, Row, int8_t, int8_t, int32_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, OutElementOp, MNKPadding, 1, 128, 128, 32, 64, 16, 16, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 2>, 16, GemmLoopScheduler, GemmPipeline>,
|
||||
DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Empty_Tuple, Row, int8_t, int8_t, int32_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, OutElementOp, MNKPadding, 1, 128, 32, 128, 64, 16, 16, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 16, GemmLoopScheduler, GemmPipeline>,
|
||||
DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Empty_Tuple, Row, int8_t, int8_t, int32_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, OutElementOp, MNKPadding, 1, 64, 64, 32, 64, 16, 16, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 2>, 16, GemmLoopScheduler, GemmPipeline>,
|
||||
DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Empty_Tuple, Row, int8_t, int8_t, int32_t, int32_t, Empty_Tuple, int8_t, PassThrough, PassThrough, OutElementOp, MNKPadding, 1, 64, 32, 64, 64, 16, 16, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 2>, 16, GemmLoopScheduler, GemmPipeline>
|
||||
// clang-format on
|
||||
>;
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -0,0 +1,49 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// Layout(A, B, C) = [Col, Row, Row]
|
||||
void add_device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instances(
|
||||
std::vector<std::unique_ptr<DeviceGemmMultipleD<Col,
|
||||
Row,
|
||||
Empty_Tuple,
|
||||
Row,
|
||||
int8_t,
|
||||
int8_t,
|
||||
Empty_Tuple,
|
||||
int8_t,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Mul_Clamp>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instances<Mul_Clamp,
|
||||
LoopScheduler::Default,
|
||||
PipelineVersion::v1>{});
|
||||
#if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instances<Mul_Clamp,
|
||||
LoopScheduler::Interwave,
|
||||
PipelineVersion::v1>{});
|
||||
#endif
|
||||
#if CK_EXPERIMENTAL_PIPELINE_V2_INSTANCES
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instances<Mul_Clamp,
|
||||
LoopScheduler::Default,
|
||||
PipelineVersion::v2>{});
|
||||
#endif
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -0,0 +1,49 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// Layout(A, B, C) = [Col, Col, Row]
|
||||
void add_device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instances(
|
||||
std::vector<std::unique_ptr<DeviceGemmMultipleD<Col,
|
||||
Col,
|
||||
Empty_Tuple,
|
||||
Row,
|
||||
int8_t,
|
||||
int8_t,
|
||||
Empty_Tuple,
|
||||
int8_t,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Mul_Clamp>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instances<Mul_Clamp,
|
||||
LoopScheduler::Default,
|
||||
PipelineVersion::v1>{});
|
||||
#if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instances<Mul_Clamp,
|
||||
LoopScheduler::Interwave,
|
||||
PipelineVersion::v1>{});
|
||||
#endif
|
||||
#if CK_EXPERIMENTAL_PIPELINE_V2_INSTANCES
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instances<Mul_Clamp,
|
||||
LoopScheduler::Default,
|
||||
PipelineVersion::v2>{});
|
||||
#endif
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -0,0 +1,49 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// Layout(A, B, C) = [Row, Row, Row]
|
||||
void add_device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instances(
|
||||
std::vector<std::unique_ptr<DeviceGemmMultipleD<Row,
|
||||
Row,
|
||||
Empty_Tuple,
|
||||
Row,
|
||||
int8_t,
|
||||
int8_t,
|
||||
Empty_Tuple,
|
||||
int8_t,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Mul_Clamp>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instances<Mul_Clamp,
|
||||
LoopScheduler::Default,
|
||||
PipelineVersion::v1>{});
|
||||
#if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instances<Mul_Clamp,
|
||||
LoopScheduler::Interwave,
|
||||
PipelineVersion::v1>{});
|
||||
#endif
|
||||
#if CK_EXPERIMENTAL_PIPELINE_V2_INSTANCES
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instances<Mul_Clamp,
|
||||
LoopScheduler::Default,
|
||||
PipelineVersion::v2>{});
|
||||
#endif
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -0,0 +1,49 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// Layout(A, B, C) = [Row, Col, Row]
|
||||
void add_device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instances(
|
||||
std::vector<std::unique_ptr<DeviceGemmMultipleD<Row,
|
||||
Col,
|
||||
Empty_Tuple,
|
||||
Row,
|
||||
int8_t,
|
||||
int8_t,
|
||||
Empty_Tuple,
|
||||
int8_t,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Mul_Clamp>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instances<Mul_Clamp,
|
||||
LoopScheduler::Default,
|
||||
PipelineVersion::v1>{});
|
||||
#if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instances<Mul_Clamp,
|
||||
LoopScheduler::Interwave,
|
||||
PipelineVersion::v1>{});
|
||||
#endif
|
||||
#if CK_EXPERIMENTAL_PIPELINE_V2_INSTANCES
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instances<Mul_Clamp,
|
||||
LoopScheduler::Default,
|
||||
PipelineVersion::v2>{});
|
||||
#endif
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -0,0 +1,41 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
template <ck::index_t... Is>
|
||||
using S = ck::Sequence<Is...>;
|
||||
|
||||
using Row = ck::tensor_layout::gemm::RowMajor;
|
||||
using Col = ck::tensor_layout::gemm::ColumnMajor;
|
||||
|
||||
using Empty_Tuple = ck::Tuple<>;
|
||||
using Row_Row_Tuple = ck::Tuple<Row, Row>;
|
||||
using Col_Col_Tuple = ck::Tuple<Col, Col>;
|
||||
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
using Relu = ck::tensor_operation::element_wise::Relu;
|
||||
|
||||
using Mul_Clamp = ck::tensor_operation::element_wise::Activation_Mul_Clamp<PassThrough>;
|
||||
using Relu_Mul_Clamp = ck::tensor_operation::element_wise::Activation_Mul_Clamp<Relu>;
|
||||
|
||||
using Add_Mul_Clamp = ck::tensor_operation::element_wise::Add_Activation_Mul_Clamp<PassThrough>;
|
||||
using Add_Relu_Mul_Clamp = ck::tensor_operation::element_wise::Add_Activation_Mul_Clamp<Relu>;
|
||||
|
||||
static constexpr auto MNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
Reference in New Issue
Block a user