mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-04-19 14:29:05 +00:00
[CK][EXAMPLES] (#2826)
-Added parameter to enable/disable verification and timing of kernel in various examples that missed it. -Added parameter to change number of groups to execute in grouped_gemm_examples. Signed-off-by: Michal Kulikowski <Michal.Kulikowski@amd.com>
This commit is contained in:
committed by
GitHub
parent
f3239395dc
commit
ffe9775e70
@@ -278,6 +278,30 @@ bool run_grouped_gemm_example(int argc, char* argv[])
|
||||
|
||||
problem_size.group_count = 16;
|
||||
|
||||
if(argc == 4)
|
||||
{
|
||||
config.do_verification = std::stoi(argv[1]);
|
||||
config.init_method = std::stoi(argv[2]);
|
||||
config.time_kernel = std::stoi(argv[3]);
|
||||
}
|
||||
else if(argc == 6)
|
||||
{
|
||||
config.do_verification = std::stoi(argv[1]);
|
||||
config.init_method = std::stoi(argv[2]);
|
||||
config.time_kernel = std::stoi(argv[3]);
|
||||
config.async_hargs = std::stoi(argv[4]);
|
||||
problem_size.group_count = std::stoi(argv[5]);
|
||||
}
|
||||
else
|
||||
{
|
||||
printf("arg1: verification (0=no, 1=yes)\n");
|
||||
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
|
||||
printf("arg3: time kernel (0=n0, 1=yes)\n");
|
||||
printf("arg4: async hargs (0=n0, 1=yes)\n");
|
||||
printf("arg5: group count (default=16)");
|
||||
exit(0);
|
||||
}
|
||||
|
||||
for(int i = 0; i < problem_size.group_count; i++)
|
||||
{
|
||||
problem_size.Ms.push_back(256 + 256 * i);
|
||||
@@ -288,27 +312,6 @@ bool run_grouped_gemm_example(int argc, char* argv[])
|
||||
problem_size.stride_Bs.push_back(problem_size.Ks[i]);
|
||||
problem_size.stride_Cs.push_back(problem_size.Ns[i]);
|
||||
}
|
||||
if(argc == 4)
|
||||
{
|
||||
config.do_verification = std::stoi(argv[1]);
|
||||
config.init_method = std::stoi(argv[2]);
|
||||
config.time_kernel = std::stoi(argv[3]);
|
||||
}
|
||||
else if(argc == 5)
|
||||
{
|
||||
config.do_verification = std::stoi(argv[1]);
|
||||
config.init_method = std::stoi(argv[2]);
|
||||
config.time_kernel = std::stoi(argv[3]);
|
||||
config.async_hargs = std::stoi(argv[4]);
|
||||
}
|
||||
else
|
||||
{
|
||||
printf("arg1: verification (0=no, 1=yes)\n");
|
||||
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
|
||||
printf("arg3: time kernel (0=n0, 1=yes)\n");
|
||||
printf("arg4: async hargs (0=n0, 1=yes)\n");
|
||||
exit(0);
|
||||
}
|
||||
|
||||
return run_grouped_gemm(problem_size, config);
|
||||
}
|
||||
|
||||
@@ -236,7 +236,7 @@ void DumpGemmLayerNormPerf(float gemm_reduce_time, float normalize_time, int M,
|
||||
<< " GB/s, " << std::endl;
|
||||
}
|
||||
|
||||
int main()
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
// GEMM shape
|
||||
ck::index_t M = 1024;
|
||||
@@ -249,6 +249,25 @@ int main()
|
||||
ck::index_t StrideD1 = 1024;
|
||||
ck::index_t StrideE = 1024;
|
||||
|
||||
bool do_verification = true;
|
||||
bool time_kernel = false;
|
||||
|
||||
if(argc == 1)
|
||||
{
|
||||
// use default
|
||||
}
|
||||
else if(argc == 3)
|
||||
{
|
||||
do_verification = std::stoi(argv[1]);
|
||||
time_kernel = static_cast<bool>(std::stoi(argv[2]));
|
||||
}
|
||||
else
|
||||
{
|
||||
printf("arg1: verification (0=no, 1=yes)\n");
|
||||
printf("arg2: time kernel (0=no, 1=yes)\n");
|
||||
exit(0);
|
||||
}
|
||||
|
||||
Tensor<ADataType> a_m_k(f_host_tensor_descriptor2d(M, K, StrideA, ALayout{}));
|
||||
Tensor<BDataType> b_k_n(f_host_tensor_descriptor2d(K, N, StrideB, BLayout{}));
|
||||
Tensor<D0DataType> bias_n(f_host_tensor_descriptor1d(N, 1));
|
||||
@@ -357,6 +376,7 @@ int main()
|
||||
normalize_invoker.Run(normalize_argument_ptr.get(), StreamConfig{nullptr, false});
|
||||
|
||||
bool pass = true;
|
||||
if(do_verification)
|
||||
{
|
||||
// verification
|
||||
Tensor<LayerNormOutDataType> host_layerNorm_m_n(
|
||||
@@ -383,27 +403,25 @@ int main()
|
||||
1e-2);
|
||||
}
|
||||
|
||||
if(time_kernel)
|
||||
{
|
||||
// evaluate kernel perf
|
||||
bool time_kernel = true;
|
||||
|
||||
float gemm_reduce_mean_reduce_square_mean_ave_time =
|
||||
gemmReduce_invoker.Run(gemmReduce_argument, StreamConfig{nullptr, time_kernel});
|
||||
gemmReduce_invoker.Run(gemmReduce_argument, StreamConfig{nullptr, true});
|
||||
float normalize_ave_time =
|
||||
normalize_invoker.Run(normalize_argument_ptr.get(), StreamConfig{nullptr, time_kernel});
|
||||
normalize_invoker.Run(normalize_argument_ptr.get(), StreamConfig{nullptr, true});
|
||||
|
||||
if(time_kernel)
|
||||
DumpGemmLayerNormPerf<ADataType,
|
||||
BDataType,
|
||||
EDataType,
|
||||
D0DataType,
|
||||
D1DataType,
|
||||
R0DataType,
|
||||
R1DataType,
|
||||
GammaDataType,
|
||||
BetaDataType,
|
||||
LayerNormOutDataType>(
|
||||
gemm_reduce_mean_reduce_square_mean_ave_time, normalize_ave_time, M, N, K);
|
||||
DumpGemmLayerNormPerf<ADataType,
|
||||
BDataType,
|
||||
EDataType,
|
||||
D0DataType,
|
||||
D1DataType,
|
||||
R0DataType,
|
||||
R1DataType,
|
||||
GammaDataType,
|
||||
BetaDataType,
|
||||
LayerNormOutDataType>(
|
||||
gemm_reduce_mean_reduce_square_mean_ave_time, normalize_ave_time, M, N, K);
|
||||
}
|
||||
|
||||
return pass ? 0 : 1;
|
||||
|
||||
@@ -221,7 +221,7 @@ void DumpGemmLayerNormPerf(float gemm_reduce_time, float normalize_time, int M,
|
||||
<< " GB/s, " << std::endl;
|
||||
}
|
||||
|
||||
int main()
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
// GEMM shape
|
||||
ck::index_t M = 1024;
|
||||
@@ -232,6 +232,25 @@ int main()
|
||||
ck::index_t StrideB = 1024;
|
||||
ck::index_t StrideE = 1024;
|
||||
|
||||
bool do_verification = true;
|
||||
bool time_kernel = false;
|
||||
|
||||
if(argc == 1)
|
||||
{
|
||||
// use default
|
||||
}
|
||||
else if(argc == 3)
|
||||
{
|
||||
do_verification = std::stoi(argv[1]);
|
||||
time_kernel = static_cast<bool>(std::stoi(argv[2]));
|
||||
}
|
||||
else
|
||||
{
|
||||
printf("arg1: verification (0=no, 1=yes)\n");
|
||||
printf("arg2: time kernel (0=no, 1=yes)\n");
|
||||
exit(0);
|
||||
}
|
||||
|
||||
Tensor<ADataType> a_m_k(f_host_tensor_descriptor2d(M, K, StrideA, ALayout{}));
|
||||
Tensor<BDataType> b_k_n(f_host_tensor_descriptor2d(K, N, StrideB, BLayout{}));
|
||||
Tensor<EDataType> e_m_n(f_host_tensor_descriptor2d(M, N, StrideE, ELayout{}));
|
||||
@@ -333,6 +352,7 @@ int main()
|
||||
normalize_invoker.Run(normalize_argument_ptr.get(), StreamConfig{nullptr, false});
|
||||
|
||||
bool pass = true;
|
||||
if(do_verification)
|
||||
{
|
||||
// verification
|
||||
Tensor<LayerNormOutDataType> host_layerNorm_m_n(
|
||||
@@ -354,25 +374,23 @@ int main()
|
||||
layerNorm_m_n, host_layerNorm_m_n, "Error: Incorrect results d1", 1e-3, 1e-3);
|
||||
}
|
||||
|
||||
if(time_kernel)
|
||||
{
|
||||
// evaluate kernel perf
|
||||
bool time_kernel = true;
|
||||
|
||||
float gemm_reduce_mean_reduce_square_mean_ave_time =
|
||||
gemmReduce_invoker.Run(gemmReduce_argument, StreamConfig{nullptr, time_kernel});
|
||||
gemmReduce_invoker.Run(gemmReduce_argument, StreamConfig{nullptr, true});
|
||||
float normalize_ave_time =
|
||||
normalize_invoker.Run(normalize_argument_ptr.get(), StreamConfig{nullptr, time_kernel});
|
||||
normalize_invoker.Run(normalize_argument_ptr.get(), StreamConfig{nullptr, true});
|
||||
|
||||
if(time_kernel)
|
||||
DumpGemmLayerNormPerf<ADataType,
|
||||
BDataType,
|
||||
EDataType,
|
||||
R0DataType,
|
||||
R1DataType,
|
||||
GammaDataType,
|
||||
BetaDataType,
|
||||
LayerNormOutDataType>(
|
||||
gemm_reduce_mean_reduce_square_mean_ave_time, normalize_ave_time, M, N, K);
|
||||
DumpGemmLayerNormPerf<ADataType,
|
||||
BDataType,
|
||||
EDataType,
|
||||
R0DataType,
|
||||
R1DataType,
|
||||
GammaDataType,
|
||||
BetaDataType,
|
||||
LayerNormOutDataType>(
|
||||
gemm_reduce_mean_reduce_square_mean_ave_time, normalize_ave_time, M, N, K);
|
||||
}
|
||||
|
||||
return pass ? 0 : 1;
|
||||
|
||||
@@ -194,22 +194,28 @@ int main(int argc, char* argv[])
|
||||
int init_method = 1;
|
||||
bool time_kernel = false;
|
||||
|
||||
if(argc == 4)
|
||||
std::size_t group_count = rand() % 16 + 1;
|
||||
|
||||
if(argc == 1)
|
||||
{
|
||||
// use default
|
||||
}
|
||||
else if(argc == 5)
|
||||
{
|
||||
do_verification = std::stoi(argv[1]);
|
||||
init_method = std::stoi(argv[2]);
|
||||
time_kernel = std::stoi(argv[3]);
|
||||
group_count = std::stoi(argv[4]);
|
||||
}
|
||||
else
|
||||
{
|
||||
printf("arg1: verification (0=no, 1=yes)\n");
|
||||
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
|
||||
printf("arg3: time kernel (0=n0, 1=yes)\n");
|
||||
printf("arg4: group count (default = random from 1..16)");
|
||||
exit(0);
|
||||
}
|
||||
|
||||
std::size_t group_count = rand() % 16 + 1;
|
||||
|
||||
// GEMM shape
|
||||
std::vector<ck::tensor_operation::device::ContractionDesc<1>> contraction_descs;
|
||||
std::vector<const void*> p_a, p_b;
|
||||
|
||||
@@ -17,4 +17,23 @@ using DevicePermuteInstance = ck::tensor_operation::device::DevicePermuteImpl
|
||||
|
||||
#include "run_permute_element_example.inc"
|
||||
|
||||
int main() { return !run_permute_element_example({1, 32000, 80}, {0, 2, 1}); }
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
bool time_kernel = false;
|
||||
|
||||
if(argc == 1)
|
||||
{
|
||||
// use default
|
||||
}
|
||||
else if(argc == 2)
|
||||
{
|
||||
time_kernel = std::stoi(argv[1]);
|
||||
}
|
||||
else
|
||||
{
|
||||
printf("arg1: time kernel (0=no, 1=yes, default=0)\n");
|
||||
exit(0);
|
||||
}
|
||||
|
||||
return !run_permute_element_example({1, 32000, 80}, {0, 2, 1}, time_kernel);
|
||||
}
|
||||
|
||||
@@ -19,4 +19,23 @@ using DevicePermuteInstance = ck::tensor_operation::device::DevicePermuteImpl
|
||||
|
||||
#include "run_permute_bundle_example.inc"
|
||||
|
||||
int main() { return !run_permute_bundle_example({1, 80, 32000}, {0, 2, 1}); }
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
bool time_kernel = false;
|
||||
|
||||
if(argc == 1)
|
||||
{
|
||||
// use default
|
||||
}
|
||||
else if(argc == 2)
|
||||
{
|
||||
time_kernel = std::stoi(argv[1]);
|
||||
}
|
||||
else
|
||||
{
|
||||
printf("arg1: time kernel (0=no, 1=yes, default=0)\n");
|
||||
exit(0);
|
||||
}
|
||||
|
||||
return !run_permute_bundle_example({1, 80, 32000}, {0, 2, 1}, time_kernel);
|
||||
}
|
||||
|
||||
@@ -17,4 +17,23 @@ using DevicePermuteInstance = ck::tensor_operation::device::DevicePermuteImpl
|
||||
|
||||
#include "run_permute_element_example.inc"
|
||||
|
||||
int main() { return !run_permute_element_example({121, 768, 80}, {0, 2, 1}); }
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
bool time_kernel = false;
|
||||
|
||||
if(argc == 1)
|
||||
{
|
||||
// use default
|
||||
}
|
||||
else if(argc == 2)
|
||||
{
|
||||
time_kernel = std::stoi(argv[1]);
|
||||
}
|
||||
else
|
||||
{
|
||||
printf("arg1: time kernel (0=no, 1=yes, default=0)\n");
|
||||
exit(0);
|
||||
}
|
||||
|
||||
return !run_permute_element_example({121, 768, 80}, {0, 2, 1}, time_kernel);
|
||||
}
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
bool run_permute_bundle(const Problem& problem)
|
||||
bool run_permute_bundle(const Problem& problem, bool time_kernel)
|
||||
{
|
||||
const auto& input_bundle_shape = problem.shape;
|
||||
const auto& input_bundle_axes = problem.axes;
|
||||
@@ -41,7 +41,7 @@ bool run_permute_bundle(const Problem& problem)
|
||||
};
|
||||
|
||||
auto invoker = permute.MakeInvoker();
|
||||
float ave_time = invoker.Run(argument, StreamConfig{nullptr, true});
|
||||
float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
|
||||
|
||||
std::cout << "Perf: " << ave_time << " ms" << std::endl;
|
||||
|
||||
@@ -72,7 +72,9 @@ bool run_permute_bundle(const Problem& problem)
|
||||
1e-6);
|
||||
}
|
||||
|
||||
bool run_permute_bundle_example(const Problem::Shape& shape, const Problem::Axes& axes)
|
||||
bool run_permute_bundle_example(const Problem::Shape& shape,
|
||||
const Problem::Axes& axes,
|
||||
bool time_kernel)
|
||||
{
|
||||
return run_permute_bundle(Problem{shape, axes});
|
||||
return run_permute_bundle(Problem{shape, axes}, time_kernel);
|
||||
}
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
bool run_permute_element(const Problem& problem)
|
||||
bool run_permute_element(const Problem& problem, bool time_kernel)
|
||||
{
|
||||
const auto& input_shape = problem.shape;
|
||||
const auto& input_axes = problem.axes;
|
||||
@@ -40,7 +40,7 @@ bool run_permute_element(const Problem& problem)
|
||||
};
|
||||
|
||||
auto invoker = permute.MakeInvoker();
|
||||
float ave_time = invoker.Run(argument, StreamConfig{nullptr, true});
|
||||
float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
|
||||
|
||||
std::cout << "Perf: " << ave_time << " ms" << std::endl;
|
||||
|
||||
@@ -59,7 +59,9 @@ bool run_permute_element(const Problem& problem)
|
||||
1e-6);
|
||||
}
|
||||
|
||||
bool run_permute_element_example(const Problem::Shape& shape, const Problem::Axes& axes)
|
||||
bool run_permute_element_example(const Problem::Shape& shape,
|
||||
const Problem::Axes& axes,
|
||||
bool time_kernel)
|
||||
{
|
||||
return run_permute_element(Problem{shape, axes});
|
||||
return run_permute_element(Problem{shape, axes}, time_kernel);
|
||||
}
|
||||
|
||||
@@ -78,8 +78,28 @@ using DeviceGroupedConvNDFwdInstance =
|
||||
|
||||
#include "run_conv2d_fwd_bias_perchannel_quantization_example.inc"
|
||||
|
||||
int main()
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
bool do_verification = true;
|
||||
bool time_kernel = true;
|
||||
|
||||
if(argc == 1)
|
||||
{
|
||||
// use default
|
||||
}
|
||||
else if(argc == 3)
|
||||
{
|
||||
do_verification = std::stoi(argv[1]);
|
||||
time_kernel = std::stoi(argv[2]);
|
||||
}
|
||||
else
|
||||
{
|
||||
printf("arg1: verification (0=no, 1=yes)\n");
|
||||
printf("arg2: time kernel (0=no, 1=yes)\n");
|
||||
exit(0);
|
||||
}
|
||||
|
||||
const auto out_element_op = OutElementOp{ActivationOp{}};
|
||||
run_conv2d_fwd_bias_perchannel_quantization_example(out_element_op);
|
||||
run_conv2d_fwd_bias_perchannel_quantization_example(
|
||||
out_element_op, do_verification, time_kernel);
|
||||
};
|
||||
|
||||
@@ -76,9 +76,28 @@ using DeviceGroupedConvNDFwdInstance =
|
||||
|
||||
#include "run_conv2d_fwd_bias_perlayer_quantization_example.inc"
|
||||
|
||||
int main()
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
bool do_verification = true;
|
||||
bool time_kernel = true;
|
||||
|
||||
if(argc == 1)
|
||||
{
|
||||
// use default
|
||||
}
|
||||
else if(argc == 3)
|
||||
{
|
||||
do_verification = std::stoi(argv[1]);
|
||||
time_kernel = std::stoi(argv[2]);
|
||||
}
|
||||
else
|
||||
{
|
||||
printf("arg1: verification (0=no, 1=yes)\n");
|
||||
printf("arg2: time kernel (0=no, 1=yes)\n");
|
||||
exit(0);
|
||||
}
|
||||
|
||||
float requant_scale = 0.5f;
|
||||
const auto out_element_op = OutElementOp{requant_scale, ActivationOp{}};
|
||||
run_conv2d_fwd_bias_perlayer_quantization_example(out_element_op);
|
||||
run_conv2d_fwd_bias_perlayer_quantization_example(out_element_op, do_verification, time_kernel);
|
||||
}
|
||||
|
||||
@@ -79,9 +79,29 @@ using DeviceGroupedConvNDFwdInstance =
|
||||
|
||||
#include "run_conv2d_fwd_bias_perchannel_quantization_example.inc"
|
||||
|
||||
int main()
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
bool do_verification = true;
|
||||
bool time_kernel = true;
|
||||
|
||||
if(argc == 1)
|
||||
{
|
||||
// use default
|
||||
}
|
||||
else if(argc == 3)
|
||||
{
|
||||
do_verification = std::stoi(argv[1]);
|
||||
time_kernel = std::stoi(argv[2]);
|
||||
}
|
||||
else
|
||||
{
|
||||
printf("arg1: verification (0=no, 1=yes)\n");
|
||||
printf("arg2: time kernel (0=no, 1=yes)\n");
|
||||
exit(0);
|
||||
}
|
||||
|
||||
float scale_z_inv = 0.5f;
|
||||
const auto out_element_op = OutElementOp{scale_z_inv, ActivationOp{}};
|
||||
run_conv2d_fwd_bias_perchannel_quantization_example(out_element_op);
|
||||
run_conv2d_fwd_bias_perchannel_quantization_example(
|
||||
out_element_op, do_verification, time_kernel);
|
||||
};
|
||||
|
||||
@@ -76,10 +76,29 @@ using DeviceGroupedConvNDFwdInstance =
|
||||
|
||||
#include "run_conv2d_fwd_bias_perlayer_quantization_example.inc"
|
||||
|
||||
int main()
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
bool do_verification = true;
|
||||
bool time_kernel = true;
|
||||
|
||||
if(argc == 1)
|
||||
{
|
||||
// use default
|
||||
}
|
||||
else if(argc == 3)
|
||||
{
|
||||
do_verification = std::stoi(argv[1]);
|
||||
time_kernel = std::stoi(argv[2]);
|
||||
}
|
||||
else
|
||||
{
|
||||
printf("arg1: verification (0=no, 1=yes)\n");
|
||||
printf("arg2: time kernel (0=no, 1=yes)\n");
|
||||
exit(0);
|
||||
}
|
||||
|
||||
float scale_acc = 0.5f;
|
||||
float scale_z_inv = 0.5f;
|
||||
const auto out_element_op = OutElementOp{scale_z_inv, scale_acc, ActivationOp{}};
|
||||
run_conv2d_fwd_bias_perlayer_quantization_example(out_element_op);
|
||||
run_conv2d_fwd_bias_perlayer_quantization_example(out_element_op, do_verification, time_kernel);
|
||||
}
|
||||
|
||||
@@ -76,8 +76,27 @@ using DeviceGroupedConvNDFwdInstance =
|
||||
|
||||
#include "run_conv2d_fwd_perchannel_quantization_example.inc"
|
||||
|
||||
int main()
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
bool do_verification = true;
|
||||
bool time_kernel = true;
|
||||
|
||||
if(argc == 1)
|
||||
{
|
||||
// use default
|
||||
}
|
||||
else if(argc == 3)
|
||||
{
|
||||
do_verification = std::stoi(argv[1]);
|
||||
time_kernel = std::stoi(argv[2]);
|
||||
}
|
||||
else
|
||||
{
|
||||
printf("arg1: verification (0=no, 1=yes)\n");
|
||||
printf("arg2: time kernel (0=no, 1=yes)\n");
|
||||
exit(0);
|
||||
}
|
||||
|
||||
const auto out_element_op = OutElementOp{ActivationOp{}};
|
||||
run_conv2d_fwd_perchannel_quantization_example(out_element_op);
|
||||
run_conv2d_fwd_perchannel_quantization_example(out_element_op, do_verification, time_kernel);
|
||||
}
|
||||
|
||||
@@ -71,9 +71,28 @@ using DeviceGroupedConvNDFwdInstance =
|
||||
|
||||
#include "run_conv2d_fwd_perlayer_quantization_example.inc"
|
||||
|
||||
int main()
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
bool do_verification = true;
|
||||
bool time_kernel = false;
|
||||
|
||||
if(argc == 1)
|
||||
{
|
||||
// use default
|
||||
}
|
||||
else if(argc == 3)
|
||||
{
|
||||
do_verification = std::stoi(argv[1]);
|
||||
time_kernel = std::stoi(argv[2]);
|
||||
}
|
||||
else
|
||||
{
|
||||
printf("arg1: verification (0=no, 1=yes)\n");
|
||||
printf("arg2: time kernel (0=no, 1=yes)\n");
|
||||
exit(0);
|
||||
}
|
||||
|
||||
float requant_scale = 0.5f;
|
||||
const auto out_element_op = OutElementOp{requant_scale, ActivationOp{}};
|
||||
run_conv2d_fwd_perlayer_quantization_example(out_element_op);
|
||||
run_conv2d_fwd_perlayer_quantization_example(out_element_op, do_verification, time_kernel);
|
||||
}
|
||||
|
||||
@@ -82,8 +82,28 @@ using DeviceGroupedConvNDFwdInstance =
|
||||
|
||||
#include "run_conv2d_fwd_bias_perchannel_quantization_example.inc"
|
||||
|
||||
int main()
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
bool do_verification = true;
|
||||
bool time_kernel = true;
|
||||
|
||||
if(argc == 1)
|
||||
{
|
||||
// use default
|
||||
}
|
||||
else if(argc == 3)
|
||||
{
|
||||
do_verification = std::stoi(argv[1]);
|
||||
time_kernel = std::stoi(argv[2]);
|
||||
}
|
||||
else
|
||||
{
|
||||
printf("arg1: verification (0=no, 1=yes)\n");
|
||||
printf("arg2: time kernel (0=no, 1=yes)\n");
|
||||
exit(0);
|
||||
}
|
||||
|
||||
const auto out_element_op = OutElementOp{ActivationOp{}};
|
||||
run_conv2d_fwd_bias_perchannel_quantization_example(out_element_op);
|
||||
run_conv2d_fwd_bias_perchannel_quantization_example(
|
||||
out_element_op, do_verification, time_kernel);
|
||||
};
|
||||
|
||||
@@ -80,9 +80,28 @@ using DeviceGroupedConvNDFwdInstance =
|
||||
|
||||
#include "run_conv2d_fwd_bias_perlayer_quantization_example.inc"
|
||||
|
||||
int main()
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
bool do_verification = true;
|
||||
bool time_kernel = true;
|
||||
|
||||
if(argc == 1)
|
||||
{
|
||||
// use default
|
||||
}
|
||||
else if(argc == 3)
|
||||
{
|
||||
do_verification = std::stoi(argv[1]);
|
||||
time_kernel = std::stoi(argv[2]);
|
||||
}
|
||||
else
|
||||
{
|
||||
printf("arg1: verification (0=no, 1=yes)\n");
|
||||
printf("arg2: time kernel (0=no, 1=yes)\n");
|
||||
exit(0);
|
||||
}
|
||||
|
||||
float requant_scale = 0.5f;
|
||||
const auto out_element_op = OutElementOp{requant_scale, ActivationOp{}};
|
||||
run_conv2d_fwd_bias_perlayer_quantization_example(out_element_op);
|
||||
run_conv2d_fwd_bias_perlayer_quantization_example(out_element_op, do_verification, time_kernel);
|
||||
}
|
||||
|
||||
@@ -80,8 +80,27 @@ using DeviceGroupedConvNDFwdInstance =
|
||||
|
||||
#include "run_conv2d_fwd_perchannel_quantization_example.inc"
|
||||
|
||||
int main()
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
bool do_verification = true;
|
||||
bool time_kernel = true;
|
||||
|
||||
if(argc == 1)
|
||||
{
|
||||
// use default
|
||||
}
|
||||
else if(argc == 3)
|
||||
{
|
||||
do_verification = std::stoi(argv[1]);
|
||||
time_kernel = std::stoi(argv[2]);
|
||||
}
|
||||
else
|
||||
{
|
||||
printf("arg1: verification (0=no, 1=yes)\n");
|
||||
printf("arg2: time kernel (0=no, 1=yes)\n");
|
||||
exit(0);
|
||||
}
|
||||
|
||||
const auto out_element_op = OutElementOp{ActivationOp{}};
|
||||
run_conv2d_fwd_perchannel_quantization_example(out_element_op);
|
||||
run_conv2d_fwd_perchannel_quantization_example(out_element_op, do_verification, time_kernel);
|
||||
}
|
||||
|
||||
@@ -75,9 +75,28 @@ using DeviceGroupedConvNDFwdInstance =
|
||||
|
||||
#include "run_conv2d_fwd_perlayer_quantization_example.inc"
|
||||
|
||||
int main()
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
bool do_verification = true;
|
||||
bool time_kernel = false;
|
||||
|
||||
if(argc == 1)
|
||||
{
|
||||
// use default
|
||||
}
|
||||
else if(argc == 3)
|
||||
{
|
||||
do_verification = std::stoi(argv[1]);
|
||||
time_kernel = std::stoi(argv[2]);
|
||||
}
|
||||
else
|
||||
{
|
||||
printf("arg1: verification (0=no, 1=yes)\n");
|
||||
printf("arg2: time kernel (0=no, 1=yes)\n");
|
||||
exit(0);
|
||||
}
|
||||
|
||||
float requant_scale = 0.5f;
|
||||
const auto out_element_op = OutElementOp{requant_scale, ActivationOp{}};
|
||||
run_conv2d_fwd_perlayer_quantization_example(out_element_op);
|
||||
run_conv2d_fwd_perlayer_quantization_example(out_element_op, do_verification, time_kernel);
|
||||
}
|
||||
|
||||
@@ -167,10 +167,10 @@ bool run_grouped_conv_fwd(bool do_verification,
|
||||
return (pass ? 0 : 1);
|
||||
}
|
||||
|
||||
int run_conv2d_fwd_bias_perchannel_quantization_example(const OutElementOp& out_element_op)
|
||||
int run_conv2d_fwd_bias_perchannel_quantization_example(const OutElementOp& out_element_op,
|
||||
bool do_verification,
|
||||
bool time_kernel)
|
||||
{
|
||||
bool do_verification = true;
|
||||
bool time_kernel = true;
|
||||
const ck::index_t ndim_spatial = 2;
|
||||
|
||||
ck::utils::conv::ConvParam conv_param{
|
||||
|
||||
@@ -155,10 +155,10 @@ bool run_grouped_conv_fwd(bool do_verification,
|
||||
return (pass ? 0 : 1);
|
||||
}
|
||||
|
||||
int run_conv2d_fwd_bias_perlayer_quantization_example(const OutElementOp& out_element_op)
|
||||
int run_conv2d_fwd_bias_perlayer_quantization_example(const OutElementOp& out_element_op,
|
||||
bool do_verification,
|
||||
bool time_kernel)
|
||||
{
|
||||
bool do_verification = true;
|
||||
bool time_kernel = true;
|
||||
const ck::index_t ndim_spatial = 2;
|
||||
|
||||
ck::utils::conv::ConvParam conv_param{
|
||||
|
||||
@@ -157,10 +157,10 @@ bool run_grouped_conv_fwd(bool do_verification,
|
||||
return (pass ? 0 : 1);
|
||||
}
|
||||
|
||||
int run_conv2d_fwd_perchannel_quantization_example(const OutElementOp& out_element_op)
|
||||
int run_conv2d_fwd_perchannel_quantization_example(const OutElementOp& out_element_op,
|
||||
bool do_verification,
|
||||
bool time_kernel)
|
||||
{
|
||||
bool do_verification = true;
|
||||
bool time_kernel = true;
|
||||
const ck::index_t ndim_spatial = 2;
|
||||
|
||||
ck::utils::conv::ConvParam conv_param{
|
||||
|
||||
@@ -139,10 +139,10 @@ bool run_grouped_conv_fwd(bool do_verification,
|
||||
return (pass ? 0 : 1);
|
||||
}
|
||||
|
||||
int run_conv2d_fwd_perlayer_quantization_example(const OutElementOp& out_element_op)
|
||||
int run_conv2d_fwd_perlayer_quantization_example(const OutElementOp& out_element_op,
|
||||
bool do_verification,
|
||||
bool time_kernel)
|
||||
{
|
||||
bool do_verification = true;
|
||||
bool time_kernel = false;
|
||||
const ck::index_t ndim_spatial = 2;
|
||||
|
||||
ck::utils::conv::ConvParam conv_param{
|
||||
|
||||
@@ -11,21 +11,36 @@ int run_groupnorm_fwd_example(int argc, char* argv[])
|
||||
ck::index_t G = 64;
|
||||
ck::index_t C = 128;
|
||||
|
||||
bool do_verification = true;
|
||||
bool time_kernel = true;
|
||||
bool log_kernel = true;
|
||||
|
||||
if(argc == 1)
|
||||
{
|
||||
// use default case
|
||||
}
|
||||
else if(argc == 6)
|
||||
else if(argc == 4)
|
||||
{
|
||||
N = std::stoi(argv[1]);
|
||||
H = std::stoi(argv[2]);
|
||||
W = std::stoi(argv[3]);
|
||||
G = std::stoi(argv[4]);
|
||||
C = std::stoi(argv[5]);
|
||||
do_verification = std::stoi(argv[1]);
|
||||
time_kernel = std::stoi(argv[2]);
|
||||
log_kernel = std::stoi(argv[3]);
|
||||
}
|
||||
else if(argc == 9)
|
||||
{
|
||||
do_verification = std::stoi(argv[1]);
|
||||
time_kernel = std::stoi(argv[2]);
|
||||
log_kernel = std::stoi(argv[3]);
|
||||
N = std::stoi(argv[4]);
|
||||
H = std::stoi(argv[5]);
|
||||
W = std::stoi(argv[6]);
|
||||
G = std::stoi(argv[7]);
|
||||
C = std::stoi(argv[8]);
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cerr << "arg1 to 5: N, H, W, G, C" << std::endl;
|
||||
std::cerr << "arg1 = verify(0=no, 1=yes), arg2 = time kernels(0=no, 1=yes), arg3 = log "
|
||||
"kernels(0=no, 1=yes), arg4 to 8: N, H, W, G, C"
|
||||
<< std::endl;
|
||||
|
||||
return 1;
|
||||
}
|
||||
@@ -94,7 +109,8 @@ int run_groupnorm_fwd_example(int argc, char* argv[])
|
||||
device_instance.SetWorkSpacePointer(argument_ptr.get(), workspace_dev.GetDeviceBuffer());
|
||||
|
||||
auto invoker_ptr = device_instance.MakeInvokerPointer();
|
||||
float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true, true});
|
||||
float ave_time =
|
||||
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel, log_kernel});
|
||||
|
||||
std::size_t num_btype = sizeof(XDataType) * N * H * W * G * C +
|
||||
sizeof(YDataType) * N * H * W * G * C + sizeof(GammaDataType) * G * C +
|
||||
@@ -106,6 +122,7 @@ int run_groupnorm_fwd_example(int argc, char* argv[])
|
||||
<< device_instance.GetTypeString() << std::endl;
|
||||
|
||||
bool pass = true;
|
||||
if(do_verification)
|
||||
{
|
||||
Tensor<YDataType> host_y({N, H, W, G, C});
|
||||
Tensor<SaveMeanInvStdDataType> host_save_mean(HostTensorDescriptor{N, G});
|
||||
|
||||
@@ -44,11 +44,27 @@ using DeviceElementwisePermuteInstance = ck::tensor_operation::device::DeviceEle
|
||||
ck::Sequence<8, 8>, // InScalarPerVectorSeq
|
||||
ck::Sequence<8>>; // OutScalarPerVectorSeq
|
||||
|
||||
int main()
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
bool do_verification = true;
|
||||
bool time_kernel = true;
|
||||
|
||||
if(argc == 1)
|
||||
{
|
||||
// use default
|
||||
}
|
||||
else if(argc == 3)
|
||||
{
|
||||
do_verification = std::stoi(argv[1]);
|
||||
time_kernel = std::stoi(argv[2]);
|
||||
}
|
||||
else
|
||||
{
|
||||
printf("arg1: verification (0=no, 1=yes)\n");
|
||||
printf("arg2: time kernel (0=no, 1=yes)\n");
|
||||
exit(0);
|
||||
}
|
||||
|
||||
std::vector<std::size_t> nchw = {16, 128, 32, 64};
|
||||
std::array<ck::index_t, 4> ab_lengths;
|
||||
std::array<ck::index_t, 4> ab_strides = {static_cast<int>(nchw[1] * nchw[2] * nchw[3]),
|
||||
|
||||
@@ -37,11 +37,27 @@ using DeviceElementwisePermuteInstance = ck::tensor_operation::device::DeviceEle
|
||||
ck::Sequence<8>, // InScalarPerVectorSeq
|
||||
ck::Sequence<8>>; // OutScalarPerVectorSeq
|
||||
|
||||
int main()
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
bool do_verification = true;
|
||||
bool time_kernel = true;
|
||||
|
||||
if(argc == 1)
|
||||
{
|
||||
// use default
|
||||
}
|
||||
else if(argc == 3)
|
||||
{
|
||||
do_verification = std::stoi(argv[1]);
|
||||
time_kernel = std::stoi(argv[2]);
|
||||
}
|
||||
else
|
||||
{
|
||||
printf("arg1: verification (0=no, 1=yes)\n");
|
||||
printf("arg2: time kernel (0=no, 1=yes)\n");
|
||||
exit(0);
|
||||
}
|
||||
|
||||
std::vector<std::size_t> nchw = {16, 128, 32, 64};
|
||||
std::vector<std::size_t> nhwc = {16, 32, 64, 128};
|
||||
|
||||
|
||||
@@ -41,11 +41,27 @@ using DeviceElementwisePermuteInstance = ck::tensor_operation::device::DeviceEle
|
||||
ck::Sequence<8>, // InScalarPerVectorSeq
|
||||
ck::Sequence<8>>; // OutScalarPerVectorSeq
|
||||
|
||||
int main()
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
bool do_verification = true;
|
||||
bool time_kernel = true;
|
||||
|
||||
if(argc == 1)
|
||||
{
|
||||
// use default
|
||||
}
|
||||
else if(argc == 3)
|
||||
{
|
||||
do_verification = std::stoi(argv[1]);
|
||||
time_kernel = std::stoi(argv[2]);
|
||||
}
|
||||
else
|
||||
{
|
||||
printf("arg1: verification (0=no, 1=yes)\n");
|
||||
printf("arg2: time kernel (0=no, 1=yes)\n");
|
||||
exit(0);
|
||||
}
|
||||
|
||||
std::vector<std::size_t> nchw = {16, 8, 32, 64};
|
||||
std::vector<std::size_t> nhwc = {16, 32, 64, 8};
|
||||
std::array<ck::index_t, 4> ab_lengths;
|
||||
|
||||
@@ -40,11 +40,27 @@ using DeviceElementwisePermuteInstance = ck::tensor_operation::device::DeviceEle
|
||||
ck::Sequence<8>, // InScalarPerVectorSeq
|
||||
ck::Sequence<8>>; // OutScalarPerVectorSeq
|
||||
|
||||
int main()
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
bool do_verification = true;
|
||||
bool time_kernel = true;
|
||||
|
||||
if(argc == 1)
|
||||
{
|
||||
// use default
|
||||
}
|
||||
else if(argc == 3)
|
||||
{
|
||||
do_verification = std::stoi(argv[1]);
|
||||
time_kernel = std::stoi(argv[2]);
|
||||
}
|
||||
else
|
||||
{
|
||||
printf("arg1: verification (0=no, 1=yes)\n");
|
||||
printf("arg2: time kernel (0=no, 1=yes)\n");
|
||||
exit(0);
|
||||
}
|
||||
|
||||
std::vector<std::size_t> nchw = {16, 128, 32, 64};
|
||||
std::vector<std::size_t> nhwc = {16, 32, 64, 128};
|
||||
|
||||
|
||||
@@ -40,11 +40,27 @@ using DeviceElementwisePermuteInstance = ck::tensor_operation::device::DeviceEle
|
||||
ck::Sequence<1>, // InScalarPerVectorSeq
|
||||
ck::Sequence<1>>; // OutScalarPerVectorSeq
|
||||
|
||||
int main()
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
bool do_verification = true;
|
||||
bool time_kernel = true;
|
||||
|
||||
if(argc == 1)
|
||||
{
|
||||
// use default
|
||||
}
|
||||
else if(argc == 3)
|
||||
{
|
||||
do_verification = std::stoi(argv[1]);
|
||||
time_kernel = std::stoi(argv[2]);
|
||||
}
|
||||
else
|
||||
{
|
||||
printf("arg1: verification (0=no, 1=yes)\n");
|
||||
printf("arg2: time kernel (0=no, 1=yes)\n");
|
||||
exit(0);
|
||||
}
|
||||
|
||||
std::vector<std::size_t> nchw = {16, 8, 32, 64};
|
||||
std::vector<std::size_t> nhwc = {16, 32, 64, 8};
|
||||
std::array<ck::index_t, 4> ab_lengths;
|
||||
|
||||
@@ -40,11 +40,27 @@ using DeviceElementwisePermuteInstance = ck::tensor_operation::device::DeviceEle
|
||||
ck::Sequence<8>, // InScalarPerVectorSeq
|
||||
ck::Sequence<8>>; // OutScalarPerVectorSeq
|
||||
|
||||
int main()
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
bool do_verification = true;
|
||||
bool time_kernel = true;
|
||||
|
||||
if(argc == 1)
|
||||
{
|
||||
// use default
|
||||
}
|
||||
else if(argc == 3)
|
||||
{
|
||||
do_verification = std::stoi(argv[1]);
|
||||
time_kernel = std::stoi(argv[2]);
|
||||
}
|
||||
else
|
||||
{
|
||||
printf("arg1: verification (0=no, 1=yes)\n");
|
||||
printf("arg2: time kernel (0=no, 1=yes)\n");
|
||||
exit(0);
|
||||
}
|
||||
|
||||
std::vector<std::size_t> nchw = {16, 128, 32, 64};
|
||||
std::vector<std::size_t> nhwc = {16, 32, 64, 128};
|
||||
|
||||
|
||||
@@ -119,6 +119,22 @@ int main(int argc, char* argv[])
|
||||
bool do_verification = true;
|
||||
bool time_kernel = true;
|
||||
|
||||
if(argc == 1)
|
||||
{
|
||||
// use default
|
||||
}
|
||||
else if(argc == 3)
|
||||
{
|
||||
do_verification = std::stoi(argv[1]);
|
||||
time_kernel = std::stoi(argv[2]);
|
||||
}
|
||||
else
|
||||
{
|
||||
printf("arg1: verification (0=no, 1=yes)\n");
|
||||
printf("arg2: time kernel (0=no, 1=yes)\n");
|
||||
exit(0);
|
||||
}
|
||||
|
||||
const float scale = 2.f;
|
||||
|
||||
ck::index_t M = 1024;
|
||||
|
||||
@@ -48,11 +48,27 @@ using DeviceElementwisePermuteInstance = ck::tensor_operation::device::DeviceEle
|
||||
ck::Sequence<8, 8, 8>, // InScalarPerVectorSeq
|
||||
ck::Sequence<8>>; // OutScalarPerVectorSeq
|
||||
|
||||
int main()
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
bool do_verification = true;
|
||||
bool time_kernel = true;
|
||||
|
||||
if(argc == 1)
|
||||
{
|
||||
// use default
|
||||
}
|
||||
else if(argc == 3)
|
||||
{
|
||||
do_verification = std::stoi(argv[1]);
|
||||
time_kernel = std::stoi(argv[2]);
|
||||
}
|
||||
else
|
||||
{
|
||||
printf("arg1: verification (0=no, 1=yes)\n");
|
||||
printf("arg2: time kernel (0=no, 1=yes)\n");
|
||||
exit(0);
|
||||
}
|
||||
|
||||
std::vector<std::size_t> nchw = {16, 128, 32, 64};
|
||||
std::array<ck::index_t, 4> ab_lengths;
|
||||
std::array<ck::index_t, 4> ab_strides = {static_cast<int>(nchw[1] * nchw[2] * nchw[3]),
|
||||
|
||||
@@ -77,9 +77,26 @@ void host_elementwise2D(HostTensorC& C,
|
||||
}
|
||||
}
|
||||
|
||||
int main()
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
bool time_kernel = true;
|
||||
bool do_verification = true;
|
||||
bool time_kernel = true;
|
||||
|
||||
if(argc == 1)
|
||||
{
|
||||
// use default
|
||||
}
|
||||
else if(argc == 3)
|
||||
{
|
||||
do_verification = std::stoi(argv[1]);
|
||||
time_kernel = std::stoi(argv[2]);
|
||||
}
|
||||
else
|
||||
{
|
||||
printf("arg1: verification (0=no, 1=yes)\n");
|
||||
printf("arg2: time kernel (0=no, 1=yes)\n");
|
||||
exit(0);
|
||||
}
|
||||
|
||||
ck::index_t M = 48 * 256;
|
||||
ck::index_t N = 1024;
|
||||
@@ -157,6 +174,7 @@ int main()
|
||||
std::cout << "Time elapase is : " << ela_time << " ms . " << std::endl;
|
||||
|
||||
bool pass = true;
|
||||
if(do_verification)
|
||||
{
|
||||
std::vector<std::size_t> mn = {static_cast<unsigned long>(M),
|
||||
static_cast<unsigned long>(N)};
|
||||
|
||||
Reference in New Issue
Block a user