mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-22 05:48:25 +00:00
Compile for gfx908 and gfx90a (#130)
* adding compilation for multiple targets * fix build * clean * update Jenkinsfile * update readme * update Jenkins * use ck::half_t instead of ushort for bf16 * rename enum classes * clean * rename * clean
This commit is contained in:
@@ -1,81 +0,0 @@
|
||||
## Docker script
|
||||
```bash
|
||||
docker run \
|
||||
-it \
|
||||
--rm \
|
||||
--privileged \
|
||||
--group-add sudo \
|
||||
-w /root/workspace \
|
||||
-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace \
|
||||
rocm/tensorflow:rocm4.3.1-tf2.6-dev \
|
||||
/bin/bash
|
||||
```
|
||||
|
||||
## Build `ckProfiler`
|
||||
```bash
|
||||
mkdir build && cd build
|
||||
```
|
||||
|
||||
```bash
|
||||
# Need to specify the target ID; the example below uses gfx908
|
||||
cmake \
|
||||
-D BUILD_DEV=OFF \
|
||||
-D CMAKE_BUILD_TYPE=Release \
|
||||
-D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 " \
|
||||
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
|
||||
-D CMAKE_PREFIX_PATH=/opt/rocm \
|
||||
..
|
||||
```
|
||||
|
||||
```bash
|
||||
make -j ckProfiler
|
||||
```
|
||||
|
||||
## Profile GEMM kernels
|
||||
```bash
|
||||
#arg1: tensor operation (gemm=GEMM)
|
||||
#arg2: data type (0=fp32, 1=fp16)
|
||||
#arg3: matrix layout (0=NN, 1=NT, 2=TN, 3=TT)
|
||||
#arg4: verification (0=no, 1=yes)
|
||||
#arg5: initialization (0=no init, 1=integer value, 2=decimal value)
|
||||
#arg6: print matrix value (0=no, 1=yes)
|
||||
#arg7: run kernel # of times (>1)
|
||||
#arg8 to 13: M, N, K, StrideA, StrideB, StrideC
|
||||
|
||||
##################### op datatype layout verify init log repeat M___ N___ K___ StrideA StrideB StrideC
|
||||
./profiler/ckProfiler gemm 1 1 1 1 0 5 3840 4096 4096 4096 4096 4096
|
||||
```
|
||||
|
||||
Result (MI100 @ 1087 MHz, 133.5 TFlops peak FP16)
|
||||
```bash
|
||||
a_m_k: dim 2, lengths {3840, 4096}, strides {4096, 1}
|
||||
b_k_n: dim 2, lengths {4096, 4096}, strides {1, 4096}
|
||||
c_m_n: dim 2, lengths {3840, 4096}, strides {4096, 1}
|
||||
....
|
||||
Best Perf: 1.1933 ms, 107.977 TFlops, 79.0848 GB/s
|
||||
```
|
||||
|
||||
## Profile forward convolution kernels
|
||||
```bash
|
||||
#arg1: tensor operation (conv=Convolution)
|
||||
#arg2: data type (0=fp32, 1=fp16)
|
||||
#arg3: input tensor layout (0=NCHW, 1=NHWC)
|
||||
#arg4: weight tensor layout (0=KCYX, 1=KYXC)
|
||||
#arg5: output tensor layout (0=NKHW, 1=NHWK)
|
||||
#arg6: verification (0=no, 1=yes)
|
||||
#arg7: initialization (0=no init, 1=integer value, 2=decimal value)
|
||||
#arg8: print matrix value (0=no, 1=yes)
|
||||
#arg9: run kernel # of times (>1)
|
||||
#arg10 to 24: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, RightPx
|
||||
##################### op datatype in_layout wei_layout out_layout verify init log repeat N__ K___ C___ Y X Hi__ Wi__ Strides Dilations LeftPads RightPads
|
||||
./profiler/ckProfiler conv_fwd 1 1 1 1 1 1 0 5 128 256 192 3 3 71 71 2 2 1 1 1 1 1 1
|
||||
```
|
||||
|
||||
Result (MI100 @ 1087 MHz, 133.5 TFlops peak FP16)
|
||||
```
|
||||
in_n_c_hi_wi: dim 4, lengths {128, 192, 71, 71}, strides {967872, 1, 13632, 192}
|
||||
wei_k_c_y_x: dim 4, lengths {256, 192, 3, 3}, strides {1728, 1, 576, 192}
|
||||
out_n_k_ho_wo: dim 4, lengths {128, 256, 36, 36}, strides {331776, 1, 9216, 256}
|
||||
....
|
||||
Best Perf: 1.42509 ms, 102.988 TFlops, 234.086 GB/s
|
||||
```
|
||||
@@ -9,7 +9,7 @@
|
||||
|
||||
int profile_batched_gemm_reduce(int argc, char* argv[])
|
||||
{
|
||||
enum struct GemmMatrixLayout_t
|
||||
enum struct GemmMatrixLayout
|
||||
{
|
||||
MK_KN_MN, // 0
|
||||
MK_NK_MN, // 1
|
||||
@@ -17,7 +17,7 @@ int profile_batched_gemm_reduce(int argc, char* argv[])
|
||||
KM_NK_MN, // 3
|
||||
};
|
||||
|
||||
enum struct GemmReduceDataType_t
|
||||
enum struct GemmReduceDataType
|
||||
{
|
||||
F32_F32_F32_F32_F32, // 0
|
||||
F16_F16_F16_F32_F32, // 1
|
||||
@@ -40,8 +40,8 @@ int profile_batched_gemm_reduce(int argc, char* argv[])
|
||||
exit(1);
|
||||
}
|
||||
|
||||
const auto data_type = static_cast<GemmReduceDataType_t>(std::stoi(argv[2]));
|
||||
const auto layout = static_cast<GemmMatrixLayout_t>(std::stoi(argv[3]));
|
||||
const auto data_type = static_cast<GemmReduceDataType>(std::stoi(argv[2]));
|
||||
const auto layout = static_cast<GemmMatrixLayout>(std::stoi(argv[3]));
|
||||
const bool do_verification = std::stoi(argv[4]);
|
||||
const int init_method = std::stoi(argv[5]);
|
||||
const bool do_log = std::stoi(argv[6]);
|
||||
@@ -57,8 +57,7 @@ int profile_batched_gemm_reduce(int argc, char* argv[])
|
||||
|
||||
const int BatchCount = std::stoi(argv[14]);
|
||||
|
||||
if(data_type == GemmReduceDataType_t::F16_F16_F16_F32_F32 &&
|
||||
layout == GemmMatrixLayout_t::MK_KN_MN)
|
||||
if(data_type == GemmReduceDataType::F16_F16_F16_F32_F32 && layout == GemmMatrixLayout::MK_KN_MN)
|
||||
{
|
||||
ck::profiler::profile_batched_gemm_reduce_impl<ck::half_t,
|
||||
ck::half_t,
|
||||
@@ -79,8 +78,8 @@ int profile_batched_gemm_reduce(int argc, char* argv[])
|
||||
(StrideC < 0) ? N : StrideC,
|
||||
BatchCount);
|
||||
}
|
||||
else if(data_type == GemmReduceDataType_t::F16_F16_F16_F32_F32 &&
|
||||
layout == GemmMatrixLayout_t::MK_NK_MN)
|
||||
else if(data_type == GemmReduceDataType::F16_F16_F16_F32_F32 &&
|
||||
layout == GemmMatrixLayout::MK_NK_MN)
|
||||
{
|
||||
ck::profiler::profile_batched_gemm_reduce_impl<ck::half_t,
|
||||
ck::half_t,
|
||||
@@ -101,8 +100,8 @@ int profile_batched_gemm_reduce(int argc, char* argv[])
|
||||
(StrideC < 0) ? N : StrideC,
|
||||
BatchCount);
|
||||
}
|
||||
else if(data_type == GemmReduceDataType_t::F16_F16_F16_F32_F32 &&
|
||||
layout == GemmMatrixLayout_t::KM_KN_MN)
|
||||
else if(data_type == GemmReduceDataType::F16_F16_F16_F32_F32 &&
|
||||
layout == GemmMatrixLayout::KM_KN_MN)
|
||||
{
|
||||
ck::profiler::profile_batched_gemm_reduce_impl<ck::half_t,
|
||||
ck::half_t,
|
||||
@@ -123,8 +122,8 @@ int profile_batched_gemm_reduce(int argc, char* argv[])
|
||||
(StrideC < 0) ? N : StrideC,
|
||||
BatchCount);
|
||||
}
|
||||
else if(data_type == GemmReduceDataType_t::F16_F16_F16_F32_F32 &&
|
||||
layout == GemmMatrixLayout_t::KM_NK_MN)
|
||||
else if(data_type == GemmReduceDataType::F16_F16_F16_F32_F32 &&
|
||||
layout == GemmMatrixLayout::KM_NK_MN)
|
||||
{
|
||||
ck::profiler::profile_batched_gemm_reduce_impl<ck::half_t,
|
||||
ck::half_t,
|
||||
|
||||
@@ -7,7 +7,7 @@
|
||||
|
||||
#include "profile_convnd_bwd_data_impl.hpp"
|
||||
|
||||
enum ConvDataType
|
||||
enum struct ConvDataType
|
||||
{
|
||||
F32_F32_F32, // 0
|
||||
F16_F16_F16, // 1
|
||||
@@ -15,19 +15,19 @@ enum ConvDataType
|
||||
INT8_INT8_INT8, // 3
|
||||
};
|
||||
|
||||
enum ConvInputLayout
|
||||
enum struct ConvInputLayout
|
||||
{
|
||||
NCHW, // 0
|
||||
NHWC, // 1
|
||||
};
|
||||
|
||||
enum ConvWeightLayout
|
||||
enum struct ConvWeightLayout
|
||||
{
|
||||
KCYX, // 0
|
||||
KYXC, // 1
|
||||
};
|
||||
|
||||
enum ConvOutputLayout
|
||||
enum struct ConvOutputLayout
|
||||
{
|
||||
NKHW, // 0
|
||||
NHWK, // 1
|
||||
@@ -97,10 +97,10 @@ int profile_convnd_bwd_data(int argc, char* argv[], int num_dim_spatial)
|
||||
return 1;
|
||||
}
|
||||
|
||||
const int data_type = static_cast<ConvDataType>(std::stoi(argv[2]));
|
||||
const int in_layout = static_cast<ConvInputLayout>(std::stoi(argv[3]));
|
||||
const int wei_layout = static_cast<ConvWeightLayout>(std::stoi(argv[4]));
|
||||
const int out_layout = static_cast<ConvOutputLayout>(std::stoi(argv[5]));
|
||||
const auto data_type = static_cast<ConvDataType>(std::stoi(argv[2]));
|
||||
const auto in_layout = static_cast<ConvInputLayout>(std::stoi(argv[3]));
|
||||
const auto wei_layout = static_cast<ConvWeightLayout>(std::stoi(argv[4]));
|
||||
const auto out_layout = static_cast<ConvOutputLayout>(std::stoi(argv[5]));
|
||||
const bool do_verification = std::stoi(argv[6]);
|
||||
const int init_method = std::stoi(argv[7]);
|
||||
const bool do_log = std::stoi(argv[8]);
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
|
||||
int profile_gemm_reduce(int argc, char* argv[])
|
||||
{
|
||||
enum struct GemmMatrixLayout_t
|
||||
enum struct GemmMatrixLayout
|
||||
{
|
||||
MK_KN_MN, // 0
|
||||
MK_NK_MN, // 1
|
||||
@@ -16,7 +16,7 @@ int profile_gemm_reduce(int argc, char* argv[])
|
||||
KM_NK_MN, // 3
|
||||
};
|
||||
|
||||
enum struct GemmReduceDataType_t
|
||||
enum struct GemmReduceDataType
|
||||
{
|
||||
F32_F32_F32_F32_F32, // 0
|
||||
F16_F16_F16_F32_F32, // 1
|
||||
@@ -39,8 +39,8 @@ int profile_gemm_reduce(int argc, char* argv[])
|
||||
exit(1);
|
||||
}
|
||||
|
||||
const auto data_type = static_cast<GemmReduceDataType_t>(std::stoi(argv[2]));
|
||||
const auto layout = static_cast<GemmMatrixLayout_t>(std::stoi(argv[3]));
|
||||
const auto data_type = static_cast<GemmReduceDataType>(std::stoi(argv[2]));
|
||||
const auto layout = static_cast<GemmMatrixLayout>(std::stoi(argv[3]));
|
||||
const bool do_verification = std::stoi(argv[4]);
|
||||
const int init_method = std::stoi(argv[5]);
|
||||
const bool do_log = std::stoi(argv[6]);
|
||||
@@ -54,8 +54,7 @@ int profile_gemm_reduce(int argc, char* argv[])
|
||||
const int StrideB = std::stoi(argv[12]);
|
||||
const int StrideC = std::stoi(argv[13]);
|
||||
|
||||
if(data_type == GemmReduceDataType_t::F16_F16_F16_F32_F32 &&
|
||||
layout == GemmMatrixLayout_t::MK_KN_MN)
|
||||
if(data_type == GemmReduceDataType::F16_F16_F16_F32_F32 && layout == GemmMatrixLayout::MK_KN_MN)
|
||||
{
|
||||
ck::profiler::profile_gemm_reduce_impl<ck::half_t,
|
||||
ck::half_t,
|
||||
@@ -75,8 +74,8 @@ int profile_gemm_reduce(int argc, char* argv[])
|
||||
(StrideB < 0) ? N : StrideB,
|
||||
(StrideC < 0) ? N : StrideC);
|
||||
}
|
||||
else if(data_type == GemmReduceDataType_t::F16_F16_F16_F32_F32 &&
|
||||
layout == GemmMatrixLayout_t::MK_NK_MN)
|
||||
else if(data_type == GemmReduceDataType::F16_F16_F16_F32_F32 &&
|
||||
layout == GemmMatrixLayout::MK_NK_MN)
|
||||
{
|
||||
ck::profiler::profile_gemm_reduce_impl<ck::half_t,
|
||||
ck::half_t,
|
||||
@@ -96,8 +95,8 @@ int profile_gemm_reduce(int argc, char* argv[])
|
||||
(StrideB < 0) ? K : StrideB,
|
||||
(StrideC < 0) ? N : StrideC);
|
||||
}
|
||||
else if(data_type == GemmReduceDataType_t::F16_F16_F16_F32_F32 &&
|
||||
layout == GemmMatrixLayout_t::KM_KN_MN)
|
||||
else if(data_type == GemmReduceDataType::F16_F16_F16_F32_F32 &&
|
||||
layout == GemmMatrixLayout::KM_KN_MN)
|
||||
{
|
||||
ck::profiler::profile_gemm_reduce_impl<ck::half_t,
|
||||
ck::half_t,
|
||||
@@ -117,8 +116,8 @@ int profile_gemm_reduce(int argc, char* argv[])
|
||||
(StrideB < 0) ? N : StrideB,
|
||||
(StrideC < 0) ? N : StrideC);
|
||||
}
|
||||
else if(data_type == GemmReduceDataType_t::F16_F16_F16_F32_F32 &&
|
||||
layout == GemmMatrixLayout_t::KM_NK_MN)
|
||||
else if(data_type == GemmReduceDataType::F16_F16_F16_F32_F32 &&
|
||||
layout == GemmMatrixLayout::KM_NK_MN)
|
||||
{
|
||||
ck::profiler::profile_gemm_reduce_impl<ck::half_t,
|
||||
ck::half_t,
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
#include <half.hpp>
|
||||
#include "profile_grouped_gemm_impl.hpp"
|
||||
|
||||
enum GemmMatrixLayout
|
||||
enum struct GemmMatrixLayout
|
||||
{
|
||||
MK_KN_MN, // 0
|
||||
MK_NK_MN, // 1
|
||||
@@ -18,7 +18,7 @@ enum GemmMatrixLayout
|
||||
KM_NK_NM, // 7
|
||||
};
|
||||
|
||||
enum GemmDataType
|
||||
enum struct GemmDataType
|
||||
{
|
||||
F32_F32_F32, // 0
|
||||
F16_F16_F16, // 1
|
||||
@@ -61,8 +61,8 @@ int profile_grouped_gemm(int argc, char* argv[])
|
||||
exit(1);
|
||||
}
|
||||
|
||||
const int data_type = static_cast<GemmDataType>(std::stoi(argv[2]));
|
||||
const int layout = static_cast<GemmMatrixLayout>(std::stoi(argv[3]));
|
||||
const auto data_type = static_cast<GemmDataType>(std::stoi(argv[2]));
|
||||
const auto layout = static_cast<GemmMatrixLayout>(std::stoi(argv[3]));
|
||||
const bool do_verification = std::stoi(argv[4]);
|
||||
const int init_method = std::stoi(argv[5]);
|
||||
const bool do_log = std::stoi(argv[6]);
|
||||
|
||||
@@ -20,9 +20,9 @@
|
||||
|
||||
using namespace std;
|
||||
|
||||
using ck::NanPropagation_t;
|
||||
using ck::ReduceTensorIndices_t;
|
||||
using ck::ReduceTensorOp_t;
|
||||
using ck::NanPropagation;
|
||||
using ck::ReduceTensorIndices;
|
||||
using ck::ReduceTensorOp;
|
||||
|
||||
static struct option long_options[] = {{"inLengths", required_argument, nullptr, 'D'},
|
||||
{"reduceDims", required_argument, nullptr, 'R'},
|
||||
@@ -84,7 +84,7 @@ static std::vector<T> getTypeValuesFromString(const char* cstr_values)
|
||||
return (values);
|
||||
}
|
||||
|
||||
enum struct appDataType_t
|
||||
enum struct AppDataType
|
||||
{
|
||||
appHalf = 0,
|
||||
appFloat = 1,
|
||||
@@ -130,18 +130,18 @@ class AppArgs
|
||||
|
||||
std::vector<float> scales;
|
||||
|
||||
ReduceTensorOp_t reduceOp = ReduceTensorOp_t::ADD;
|
||||
appDataType_t compTypeId = appDataType_t::appFloat;
|
||||
appDataType_t outTypeId = appDataType_t::appFloat;
|
||||
ReduceTensorOp reduceOp = ReduceTensorOp::ADD;
|
||||
AppDataType compTypeId = AppDataType::appFloat;
|
||||
AppDataType outTypeId = AppDataType::appFloat;
|
||||
|
||||
bool compType_assigned = false;
|
||||
bool outType_assigned = false;
|
||||
|
||||
NanPropagation_t nanOpt = NanPropagation_t::NOT_PROPAGATE_NAN;
|
||||
ReduceTensorIndices_t indicesOpt = ReduceTensorIndices_t::NO_INDICES;
|
||||
bool do_log = false;
|
||||
bool do_verification = false;
|
||||
bool do_dumpout = false;
|
||||
NanPropagation nanOpt = NanPropagation::NOT_PROPAGATE_NAN;
|
||||
ReduceTensorIndices indicesOpt = ReduceTensorIndices::NO_INDICES;
|
||||
bool do_log = false;
|
||||
bool do_verification = false;
|
||||
bool do_dumpout = false;
|
||||
|
||||
int init_method;
|
||||
int nrepeat;
|
||||
@@ -213,33 +213,33 @@ class AppArgs
|
||||
if(!optarg)
|
||||
throw std::runtime_error("Invalid option format!");
|
||||
|
||||
reduceOp = static_cast<ReduceTensorOp_t>(std::atoi(optarg));
|
||||
reduceOp = static_cast<ReduceTensorOp>(std::atoi(optarg));
|
||||
break;
|
||||
case 'C':
|
||||
if(!optarg)
|
||||
throw std::runtime_error("Invalid option format!");
|
||||
|
||||
compTypeId = static_cast<appDataType_t>(std::atoi(optarg));
|
||||
compTypeId = static_cast<AppDataType>(std::atoi(optarg));
|
||||
compType_assigned = true;
|
||||
break;
|
||||
case 'W':
|
||||
if(!optarg)
|
||||
throw std::runtime_error("Invalid option format!");
|
||||
|
||||
outTypeId = static_cast<appDataType_t>(std::atoi(optarg));
|
||||
outTypeId = static_cast<AppDataType>(std::atoi(optarg));
|
||||
outType_assigned = true;
|
||||
break;
|
||||
case 'N':
|
||||
if(!optarg)
|
||||
throw std::runtime_error("Invalid option format!");
|
||||
|
||||
nanOpt = static_cast<NanPropagation_t>(std::atoi(optarg));
|
||||
nanOpt = static_cast<NanPropagation>(std::atoi(optarg));
|
||||
break;
|
||||
case 'I':
|
||||
if(!optarg)
|
||||
throw std::runtime_error("Invalid option format!");
|
||||
|
||||
indicesOpt = static_cast<ReduceTensorIndices_t>(std::atoi(optarg));
|
||||
indicesOpt = static_cast<ReduceTensorIndices>(std::atoi(optarg));
|
||||
break;
|
||||
case 'S':
|
||||
if(!optarg)
|
||||
@@ -303,10 +303,10 @@ class AppArgs
|
||||
scales.push_back(0.0f);
|
||||
};
|
||||
|
||||
if(reduceOp == ReduceTensorOp_t::MIN || reduceOp == ReduceTensorOp_t::MAX ||
|
||||
reduceOp == ReduceTensorOp_t::AMAX)
|
||||
if(reduceOp == ReduceTensorOp::MIN || reduceOp == ReduceTensorOp::MAX ||
|
||||
reduceOp == ReduceTensorOp::AMAX)
|
||||
{
|
||||
if(indicesOpt != ReduceTensorIndices_t::NO_INDICES)
|
||||
if(indicesOpt != ReduceTensorIndices::NO_INDICES)
|
||||
need_indices = true;
|
||||
|
||||
// for indexable operations, no need to assign compType and outType, just let them be
|
||||
@@ -333,22 +333,22 @@ int profile_reduce(int argc, char* argv[])
|
||||
|
||||
check_reduce_dims(rank, args.reduceDims);
|
||||
|
||||
if(args.reduceOp == ReduceTensorOp_t::MUL || args.reduceOp == ReduceTensorOp_t::NORM1)
|
||||
if(args.reduceOp == ReduceTensorOp::MUL || args.reduceOp == ReduceTensorOp::NORM1)
|
||||
throw std::runtime_error("MUL and NORM1 are not supported by composable kernel!");
|
||||
|
||||
if(args.use_half)
|
||||
{
|
||||
if(!args.compType_assigned)
|
||||
args.compTypeId = appDataType_t::appHalf;
|
||||
args.compTypeId = AppDataType::appHalf;
|
||||
|
||||
if(args.outType_assigned &&
|
||||
(args.outTypeId != appDataType_t::appHalf && args.outTypeId != appDataType_t::appFloat))
|
||||
args.outTypeId = appDataType_t::appFloat;
|
||||
(args.outTypeId != AppDataType::appHalf && args.outTypeId != AppDataType::appFloat))
|
||||
args.outTypeId = AppDataType::appFloat;
|
||||
|
||||
if(!args.outType_assigned)
|
||||
args.outTypeId = appDataType_t::appHalf;
|
||||
args.outTypeId = AppDataType::appHalf;
|
||||
|
||||
if(args.compTypeId == appDataType_t::appHalf)
|
||||
if(args.compTypeId == AppDataType::appHalf)
|
||||
{
|
||||
profile_reduce_impl<ck::half_t, ck::half_t, ck::half_t>(args.do_verification,
|
||||
args.init_method,
|
||||
@@ -363,7 +363,7 @@ int profile_reduce(int argc, char* argv[])
|
||||
args.scales[0],
|
||||
args.scales[1]);
|
||||
}
|
||||
else if(args.compTypeId == appDataType_t::appFloat)
|
||||
else if(args.compTypeId == AppDataType::appFloat)
|
||||
{
|
||||
profile_reduce_impl<ck::half_t, float, ck::half_t>(args.do_verification,
|
||||
args.init_method,
|
||||
@@ -399,16 +399,16 @@ int profile_reduce(int argc, char* argv[])
|
||||
else if(args.use_int8)
|
||||
{
|
||||
if(!args.compType_assigned)
|
||||
args.compTypeId = appDataType_t::appInt8;
|
||||
args.compTypeId = AppDataType::appInt8;
|
||||
|
||||
if(args.outType_assigned &&
|
||||
(args.outTypeId != appDataType_t::appInt8 && args.outTypeId != appDataType_t::appInt32))
|
||||
args.outTypeId = appDataType_t::appInt32;
|
||||
(args.outTypeId != AppDataType::appInt8 && args.outTypeId != AppDataType::appInt32))
|
||||
args.outTypeId = AppDataType::appInt32;
|
||||
|
||||
if(!args.outType_assigned)
|
||||
args.outTypeId = appDataType_t::appInt8;
|
||||
args.outTypeId = AppDataType::appInt8;
|
||||
|
||||
if(args.compTypeId == appDataType_t::appInt8)
|
||||
if(args.compTypeId == AppDataType::appInt8)
|
||||
{
|
||||
profile_reduce_impl<int8_t, int8_t, int8_t>(args.do_verification,
|
||||
args.init_method,
|
||||
@@ -423,7 +423,7 @@ int profile_reduce(int argc, char* argv[])
|
||||
args.scales[0],
|
||||
args.scales[1]);
|
||||
}
|
||||
else if(args.compTypeId == appDataType_t::appInt32)
|
||||
else if(args.compTypeId == AppDataType::appInt32)
|
||||
{
|
||||
profile_reduce_impl<int8_t, int32_t, int8_t>(args.do_verification,
|
||||
args.init_method,
|
||||
@@ -443,12 +443,12 @@ int profile_reduce(int argc, char* argv[])
|
||||
}
|
||||
else if(args.use_bf16)
|
||||
{
|
||||
if(args.outType_assigned && (args.outTypeId != appDataType_t::appBFloat16 &&
|
||||
args.outTypeId != appDataType_t::appFloat))
|
||||
args.outTypeId = appDataType_t::appFloat;
|
||||
if(args.outType_assigned &&
|
||||
(args.outTypeId != AppDataType::appBFloat16 && args.outTypeId != AppDataType::appFloat))
|
||||
args.outTypeId = AppDataType::appFloat;
|
||||
|
||||
if(!args.outType_assigned)
|
||||
args.outTypeId = appDataType_t::appBFloat16;
|
||||
args.outTypeId = AppDataType::appBFloat16;
|
||||
|
||||
profile_reduce_impl<ck::bhalf_t, float, ck::bhalf_t>(args.do_verification,
|
||||
args.init_method,
|
||||
@@ -465,7 +465,7 @@ int profile_reduce(int argc, char* argv[])
|
||||
}
|
||||
else
|
||||
{
|
||||
if(args.compTypeId == appDataType_t::appFloat)
|
||||
if(args.compTypeId == AppDataType::appFloat)
|
||||
{
|
||||
profile_reduce_impl<float, float, float>(args.do_verification,
|
||||
args.init_method,
|
||||
@@ -480,7 +480,7 @@ int profile_reduce(int argc, char* argv[])
|
||||
args.scales[0],
|
||||
args.scales[1]);
|
||||
}
|
||||
else if(args.compTypeId == appDataType_t::appDouble)
|
||||
else if(args.compTypeId == AppDataType::appDouble)
|
||||
{
|
||||
profile_reduce_impl<float, double, float>(args.do_verification,
|
||||
args.init_method,
|
||||
|
||||
@@ -85,26 +85,24 @@ int main(int argc, char* argv[])
|
||||
{
|
||||
return profile_reduce(argc, argv);
|
||||
}
|
||||
else
|
||||
{
|
||||
// clang-format off
|
||||
printf("arg1: tensor operation (gemm: GEMM\n"
|
||||
" gemm_bias_2d: GEMM+Bias(2D)\n"
|
||||
" gemm_bias_relu: GEMM+Bias+ReLU\n"
|
||||
" gemm_bias_relu_add: GEMM+Bias+ReLU+Add\n"
|
||||
" gemm_reduce: GEMM+Reduce\n"
|
||||
" grouped_gemm: Grouped Gemm\n"
|
||||
" conv_fwd: ForwardConvolution\n"
|
||||
" conv_fwd_bias_relu: ForwardConvolution+Bias+ReLU\n"
|
||||
" conv_fwd_bias_relu_add: ForwardConvolution+Bias+ReLU+Add\n"
|
||||
" conv_fwd_bias_relu_atomic_add: ForwardConvolution+Bias+ReLU+AtomicAdd\n"
|
||||
" conv1d_bwd_data: BackwardConvolution data 1 dim\n"
|
||||
" conv2d_bwd_data: BackwardConvolution data 2 dim\n"
|
||||
" conv3d_bwd_data: BackwardConvolution data 3 dim\n"
|
||||
" grouped_gemm: Grouped Gemm\n"
|
||||
" reduce: REDUCE\n");
|
||||
// clang-format on
|
||||
|
||||
return 0;
|
||||
}
|
||||
// clang-format off
|
||||
printf("arg1: tensor operation (gemm: GEMM\n"
|
||||
" gemm_bias_2d: GEMM+Bias(2D)\n"
|
||||
" gemm_bias_relu: GEMM+Bias+ReLU\n"
|
||||
" gemm_bias_relu_add: GEMM+Bias+ReLU+Add\n"
|
||||
" gemm_reduce: GEMM+Reduce\n"
|
||||
" grouped_gemm: Grouped GEMM\n"
|
||||
" conv_fwd: ForwardConvolution\n"
|
||||
" conv_fwd_bias_relu: ForwardConvolution+Bias+ReLU\n"
|
||||
" conv_fwd_bias_relu_add: ForwardConvolution+Bias+ReLU+Add\n"
|
||||
" conv_fwd_bias_relu_atomic_add: ForwardConvolution+Bias+ReLU+AtomicAdd\n"
|
||||
" conv1d_bwd_data: BackwardConvolution data 1d\n"
|
||||
" conv2d_bwd_data: BackwardConvolution data 2d\n"
|
||||
" conv3d_bwd_data: BackwardConvolution data 3d\n"
|
||||
" grouped_gemm: Grouped GEMM\n"
|
||||
" reduce: Reduce\n");
|
||||
// clang-format on
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user