refactored deviceBatchedGemm; removed GridwiseBatchedGemm; added fp32 and int8 to profiler (#120)

changed long_index_t to index_t when computing memory offset

uncomment other ops in profiler

added test for batched_gemm
This commit is contained in:
Jianfeng Yan
2022-03-21 16:45:14 -05:00
committed by GitHub
parent 485ea46a40
commit cb87b049de
23 changed files with 1309 additions and 896 deletions

View File

@@ -1,3 +1,4 @@
#include <cstdint>
#include <iostream>
#include <numeric>
#include <initializer_list>
@@ -29,8 +30,9 @@ enum GemmMatrixLayout
enum GemmDataType
{
F32_F32_F32, // 0
F16_F16_F16, // 1
F32_F32_F32, // 0
F16_F16_F16, // 1
Int8_Int8_Int8, // 2
};
int profile_batched_gemm(int argc, char* argv[])
@@ -38,7 +40,7 @@ int profile_batched_gemm(int argc, char* argv[])
if(!(argc == 15))
{
printf("arg1: tensor operation (batched_gemm: Batched GEMM)\n");
printf("arg2: data type (0: fp32; 1: fp16)\n");
printf("arg2: data type (0: fp32; 1: fp16, 2: int8)\n");
printf("arg3: matrix layout (0: A[g, m, k] * B[g, k, n] = C[g, m, n];\n");
printf(" 1: A[g, m, k] * B[g, n, k] = C[g, m, n];\n");
printf(" 2: A[g, k, m] * B[g, k, n] = C[g, m, n];\n");
@@ -146,6 +148,163 @@ int profile_batched_gemm(int argc, char* argv[])
(StrideB < 0) ? K : StrideB,
(StrideC < 0) ? N : StrideC);
}
else if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::MK_KN_MN)
{
ck::profiler::profile_batched_gemm_impl<float,
float,
float,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor>(
do_verification,
init_method,
do_log,
nrepeat,
M,
N,
K,
(StrideA < 0) ? K : StrideA,
(StrideB < 0) ? N : StrideB,
(StrideC < 0) ? N : StrideC,
BatchCount);
}
else if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::MK_NK_MN)
{
ck::profiler::profile_batched_gemm_impl<float,
float,
float,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::ColumnMajor,
ck::tensor_layout::gemm::RowMajor>(
do_verification,
init_method,
do_log,
nrepeat,
M,
N,
K,
(StrideA < 0) ? K : StrideA,
(StrideB < 0) ? K : StrideB,
(StrideC < 0) ? N : StrideC,
BatchCount);
}
else if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::KM_KN_MN)
{
ck::profiler::profile_batched_gemm_impl<float,
float,
float,
ck::tensor_layout::gemm::ColumnMajor,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor>(
do_verification,
init_method,
do_log,
nrepeat,
M,
N,
K,
(StrideA < 0) ? M : StrideA,
(StrideB < 0) ? N : StrideB,
(StrideC < 0) ? N : StrideC);
}
else if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::KM_NK_MN)
{
ck::profiler::profile_batched_gemm_impl<float,
float,
float,
ck::tensor_layout::gemm::ColumnMajor,
ck::tensor_layout::gemm::ColumnMajor,
ck::tensor_layout::gemm::RowMajor>(
do_verification,
init_method,
do_log,
nrepeat,
M,
N,
K,
(StrideA < 0) ? M : StrideA,
(StrideB < 0) ? K : StrideB,
(StrideC < 0) ? N : StrideC);
}
else if(data_type == GemmDataType::Int8_Int8_Int8 && layout == GemmMatrixLayout::MK_KN_MN)
{
ck::profiler::profile_batched_gemm_impl<int8_t,
int8_t,
int8_t,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor>(
do_verification,
init_method,
do_log,
nrepeat,
M,
N,
K,
(StrideA < 0) ? K : StrideA,
(StrideB < 0) ? N : StrideB,
(StrideC < 0) ? N : StrideC,
BatchCount);
}
else if(data_type == GemmDataType::Int8_Int8_Int8 && layout == GemmMatrixLayout::MK_NK_MN)
{
ck::profiler::profile_batched_gemm_impl<int8_t,
int8_t,
int8_t,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::ColumnMajor,
ck::tensor_layout::gemm::RowMajor>(
do_verification,
init_method,
do_log,
nrepeat,
M,
N,
K,
(StrideA < 0) ? K : StrideA,
(StrideB < 0) ? K : StrideB,
(StrideC < 0) ? N : StrideC,
BatchCount);
}
else if(data_type == GemmDataType::Int8_Int8_Int8 && layout == GemmMatrixLayout::KM_KN_MN)
{
ck::profiler::profile_batched_gemm_impl<int8_t,
int8_t,
int8_t,
ck::tensor_layout::gemm::ColumnMajor,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor>(
do_verification,
init_method,
do_log,
nrepeat,
M,
N,
K,
(StrideA < 0) ? M : StrideA,
(StrideB < 0) ? N : StrideB,
(StrideC < 0) ? N : StrideC);
}
else if(data_type == GemmDataType::Int8_Int8_Int8 && layout == GemmMatrixLayout::KM_NK_MN)
{
ck::profiler::profile_batched_gemm_impl<int8_t,
int8_t,
int8_t,
ck::tensor_layout::gemm::ColumnMajor,
ck::tensor_layout::gemm::ColumnMajor,
ck::tensor_layout::gemm::RowMajor>(
do_verification,
init_method,
do_log,
nrepeat,
M,
N,
K,
(StrideA < 0) ? M : StrideA,
(StrideB < 0) ? K : StrideB,
(StrideC < 0) ? N : StrideC,
BatchCount);
}
else
{
throw std::runtime_error("wrong! this GEMM data_type & layout is not implemented");