[CK][EXAMPLES] (#2826)

-Added parameter to enable/disable verification and timing of kernel in various examples that missed it.
-Added parameter to change number of groups to execute in grouped_gemm_examples.

Signed-off-by: Michal Kulikowski <Michal.Kulikowski@amd.com>
This commit is contained in:
Michał Kulikowski
2025-09-11 21:33:00 +02:00
committed by GitHub
parent f3239395dc
commit ffe9775e70
33 changed files with 578 additions and 116 deletions

View File

@@ -278,6 +278,30 @@ bool run_grouped_gemm_example(int argc, char* argv[])
problem_size.group_count = 16;
if(argc == 4)
{
config.do_verification = std::stoi(argv[1]);
config.init_method = std::stoi(argv[2]);
config.time_kernel = std::stoi(argv[3]);
}
else if(argc == 6)
{
config.do_verification = std::stoi(argv[1]);
config.init_method = std::stoi(argv[2]);
config.time_kernel = std::stoi(argv[3]);
config.async_hargs = std::stoi(argv[4]);
problem_size.group_count = std::stoi(argv[5]);
}
else
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
printf("arg3: time kernel (0=n0, 1=yes)\n");
printf("arg4: async hargs (0=n0, 1=yes)\n");
printf("arg5: group count (default=16)");
exit(0);
}
for(int i = 0; i < problem_size.group_count; i++)
{
problem_size.Ms.push_back(256 + 256 * i);
@@ -288,27 +312,6 @@ bool run_grouped_gemm_example(int argc, char* argv[])
problem_size.stride_Bs.push_back(problem_size.Ks[i]);
problem_size.stride_Cs.push_back(problem_size.Ns[i]);
}
if(argc == 4)
{
config.do_verification = std::stoi(argv[1]);
config.init_method = std::stoi(argv[2]);
config.time_kernel = std::stoi(argv[3]);
}
else if(argc == 5)
{
config.do_verification = std::stoi(argv[1]);
config.init_method = std::stoi(argv[2]);
config.time_kernel = std::stoi(argv[3]);
config.async_hargs = std::stoi(argv[4]);
}
else
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
printf("arg3: time kernel (0=n0, 1=yes)\n");
printf("arg4: async hargs (0=n0, 1=yes)\n");
exit(0);
}
return run_grouped_gemm(problem_size, config);
}

View File

@@ -236,7 +236,7 @@ void DumpGemmLayerNormPerf(float gemm_reduce_time, float normalize_time, int M,
<< " GB/s, " << std::endl;
}
int main()
int main(int argc, char* argv[])
{
// GEMM shape
ck::index_t M = 1024;
@@ -249,6 +249,25 @@ int main()
ck::index_t StrideD1 = 1024;
ck::index_t StrideE = 1024;
bool do_verification = true;
bool time_kernel = false;
if(argc == 1)
{
// use default
}
else if(argc == 3)
{
do_verification = std::stoi(argv[1]);
time_kernel = static_cast<bool>(std::stoi(argv[2]));
}
else
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: time kernel (0=no, 1=yes)\n");
exit(0);
}
Tensor<ADataType> a_m_k(f_host_tensor_descriptor2d(M, K, StrideA, ALayout{}));
Tensor<BDataType> b_k_n(f_host_tensor_descriptor2d(K, N, StrideB, BLayout{}));
Tensor<D0DataType> bias_n(f_host_tensor_descriptor1d(N, 1));
@@ -357,6 +376,7 @@ int main()
normalize_invoker.Run(normalize_argument_ptr.get(), StreamConfig{nullptr, false});
bool pass = true;
if(do_verification)
{
// verification
Tensor<LayerNormOutDataType> host_layerNorm_m_n(
@@ -383,27 +403,25 @@ int main()
1e-2);
}
if(time_kernel)
{
// evaluate kernel perf
bool time_kernel = true;
float gemm_reduce_mean_reduce_square_mean_ave_time =
gemmReduce_invoker.Run(gemmReduce_argument, StreamConfig{nullptr, time_kernel});
gemmReduce_invoker.Run(gemmReduce_argument, StreamConfig{nullptr, true});
float normalize_ave_time =
normalize_invoker.Run(normalize_argument_ptr.get(), StreamConfig{nullptr, time_kernel});
normalize_invoker.Run(normalize_argument_ptr.get(), StreamConfig{nullptr, true});
if(time_kernel)
DumpGemmLayerNormPerf<ADataType,
BDataType,
EDataType,
D0DataType,
D1DataType,
R0DataType,
R1DataType,
GammaDataType,
BetaDataType,
LayerNormOutDataType>(
gemm_reduce_mean_reduce_square_mean_ave_time, normalize_ave_time, M, N, K);
DumpGemmLayerNormPerf<ADataType,
BDataType,
EDataType,
D0DataType,
D1DataType,
R0DataType,
R1DataType,
GammaDataType,
BetaDataType,
LayerNormOutDataType>(
gemm_reduce_mean_reduce_square_mean_ave_time, normalize_ave_time, M, N, K);
}
return pass ? 0 : 1;

View File

@@ -221,7 +221,7 @@ void DumpGemmLayerNormPerf(float gemm_reduce_time, float normalize_time, int M,
<< " GB/s, " << std::endl;
}
int main()
int main(int argc, char* argv[])
{
// GEMM shape
ck::index_t M = 1024;
@@ -232,6 +232,25 @@ int main()
ck::index_t StrideB = 1024;
ck::index_t StrideE = 1024;
bool do_verification = true;
bool time_kernel = false;
if(argc == 1)
{
// use default
}
else if(argc == 3)
{
do_verification = std::stoi(argv[1]);
time_kernel = static_cast<bool>(std::stoi(argv[2]));
}
else
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: time kernel (0=no, 1=yes)\n");
exit(0);
}
Tensor<ADataType> a_m_k(f_host_tensor_descriptor2d(M, K, StrideA, ALayout{}));
Tensor<BDataType> b_k_n(f_host_tensor_descriptor2d(K, N, StrideB, BLayout{}));
Tensor<EDataType> e_m_n(f_host_tensor_descriptor2d(M, N, StrideE, ELayout{}));
@@ -333,6 +352,7 @@ int main()
normalize_invoker.Run(normalize_argument_ptr.get(), StreamConfig{nullptr, false});
bool pass = true;
if(do_verification)
{
// verification
Tensor<LayerNormOutDataType> host_layerNorm_m_n(
@@ -354,25 +374,23 @@ int main()
layerNorm_m_n, host_layerNorm_m_n, "Error: Incorrect results d1", 1e-3, 1e-3);
}
if(time_kernel)
{
// evaluate kernel perf
bool time_kernel = true;
float gemm_reduce_mean_reduce_square_mean_ave_time =
gemmReduce_invoker.Run(gemmReduce_argument, StreamConfig{nullptr, time_kernel});
gemmReduce_invoker.Run(gemmReduce_argument, StreamConfig{nullptr, true});
float normalize_ave_time =
normalize_invoker.Run(normalize_argument_ptr.get(), StreamConfig{nullptr, time_kernel});
normalize_invoker.Run(normalize_argument_ptr.get(), StreamConfig{nullptr, true});
if(time_kernel)
DumpGemmLayerNormPerf<ADataType,
BDataType,
EDataType,
R0DataType,
R1DataType,
GammaDataType,
BetaDataType,
LayerNormOutDataType>(
gemm_reduce_mean_reduce_square_mean_ave_time, normalize_ave_time, M, N, K);
DumpGemmLayerNormPerf<ADataType,
BDataType,
EDataType,
R0DataType,
R1DataType,
GammaDataType,
BetaDataType,
LayerNormOutDataType>(
gemm_reduce_mean_reduce_square_mean_ave_time, normalize_ave_time, M, N, K);
}
return pass ? 0 : 1;

View File

@@ -194,22 +194,28 @@ int main(int argc, char* argv[])
int init_method = 1;
bool time_kernel = false;
if(argc == 4)
std::size_t group_count = rand() % 16 + 1;
if(argc == 1)
{
// use default
}
else if(argc == 5)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
group_count = std::stoi(argv[4]);
}
else
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
printf("arg3: time kernel (0=n0, 1=yes)\n");
printf("arg4: group count (default = random from 1..16)");
exit(0);
}
std::size_t group_count = rand() % 16 + 1;
// GEMM shape
std::vector<ck::tensor_operation::device::ContractionDesc<1>> contraction_descs;
std::vector<const void*> p_a, p_b;

View File

@@ -17,4 +17,23 @@ using DevicePermuteInstance = ck::tensor_operation::device::DevicePermuteImpl
#include "run_permute_element_example.inc"
int main() { return !run_permute_element_example({1, 32000, 80}, {0, 2, 1}); }
int main(int argc, char* argv[])
{
bool time_kernel = false;
if(argc == 1)
{
// use default
}
else if(argc == 2)
{
time_kernel = std::stoi(argv[1]);
}
else
{
printf("arg1: time kernel (0=no, 1=yes, default=0)\n");
exit(0);
}
return !run_permute_element_example({1, 32000, 80}, {0, 2, 1}, time_kernel);
}

View File

@@ -19,4 +19,23 @@ using DevicePermuteInstance = ck::tensor_operation::device::DevicePermuteImpl
#include "run_permute_bundle_example.inc"
int main() { return !run_permute_bundle_example({1, 80, 32000}, {0, 2, 1}); }
int main(int argc, char* argv[])
{
bool time_kernel = false;
if(argc == 1)
{
// use default
}
else if(argc == 2)
{
time_kernel = std::stoi(argv[1]);
}
else
{
printf("arg1: time kernel (0=no, 1=yes, default=0)\n");
exit(0);
}
return !run_permute_bundle_example({1, 80, 32000}, {0, 2, 1}, time_kernel);
}

View File

@@ -17,4 +17,23 @@ using DevicePermuteInstance = ck::tensor_operation::device::DevicePermuteImpl
#include "run_permute_element_example.inc"
int main() { return !run_permute_element_example({121, 768, 80}, {0, 2, 1}); }
int main(int argc, char* argv[])
{
bool time_kernel = false;
if(argc == 1)
{
// use default
}
else if(argc == 2)
{
time_kernel = std::stoi(argv[1]);
}
else
{
printf("arg1: time kernel (0=no, 1=yes, default=0)\n");
exit(0);
}
return !run_permute_element_example({121, 768, 80}, {0, 2, 1}, time_kernel);
}

View File

@@ -3,7 +3,7 @@
#pragma once
bool run_permute_bundle(const Problem& problem)
bool run_permute_bundle(const Problem& problem, bool time_kernel)
{
const auto& input_bundle_shape = problem.shape;
const auto& input_bundle_axes = problem.axes;
@@ -41,7 +41,7 @@ bool run_permute_bundle(const Problem& problem)
};
auto invoker = permute.MakeInvoker();
float ave_time = invoker.Run(argument, StreamConfig{nullptr, true});
float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
std::cout << "Perf: " << ave_time << " ms" << std::endl;
@@ -72,7 +72,9 @@ bool run_permute_bundle(const Problem& problem)
1e-6);
}
bool run_permute_bundle_example(const Problem::Shape& shape, const Problem::Axes& axes)
bool run_permute_bundle_example(const Problem::Shape& shape,
const Problem::Axes& axes,
bool time_kernel)
{
return run_permute_bundle(Problem{shape, axes});
return run_permute_bundle(Problem{shape, axes}, time_kernel);
}

View File

@@ -3,7 +3,7 @@
#pragma once
bool run_permute_element(const Problem& problem)
bool run_permute_element(const Problem& problem, bool time_kernel)
{
const auto& input_shape = problem.shape;
const auto& input_axes = problem.axes;
@@ -40,7 +40,7 @@ bool run_permute_element(const Problem& problem)
};
auto invoker = permute.MakeInvoker();
float ave_time = invoker.Run(argument, StreamConfig{nullptr, true});
float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
std::cout << "Perf: " << ave_time << " ms" << std::endl;
@@ -59,7 +59,9 @@ bool run_permute_element(const Problem& problem)
1e-6);
}
bool run_permute_element_example(const Problem::Shape& shape, const Problem::Axes& axes)
bool run_permute_element_example(const Problem::Shape& shape,
const Problem::Axes& axes,
bool time_kernel)
{
return run_permute_element(Problem{shape, axes});
return run_permute_element(Problem{shape, axes}, time_kernel);
}

View File

@@ -78,8 +78,28 @@ using DeviceGroupedConvNDFwdInstance =
#include "run_conv2d_fwd_bias_perchannel_quantization_example.inc"
int main()
int main(int argc, char* argv[])
{
bool do_verification = true;
bool time_kernel = true;
if(argc == 1)
{
// use default
}
else if(argc == 3)
{
do_verification = std::stoi(argv[1]);
time_kernel = std::stoi(argv[2]);
}
else
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: time kernel (0=no, 1=yes)\n");
exit(0);
}
const auto out_element_op = OutElementOp{ActivationOp{}};
run_conv2d_fwd_bias_perchannel_quantization_example(out_element_op);
run_conv2d_fwd_bias_perchannel_quantization_example(
out_element_op, do_verification, time_kernel);
};

View File

@@ -76,9 +76,28 @@ using DeviceGroupedConvNDFwdInstance =
#include "run_conv2d_fwd_bias_perlayer_quantization_example.inc"
int main()
int main(int argc, char* argv[])
{
bool do_verification = true;
bool time_kernel = true;
if(argc == 1)
{
// use default
}
else if(argc == 3)
{
do_verification = std::stoi(argv[1]);
time_kernel = std::stoi(argv[2]);
}
else
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: time kernel (0=no, 1=yes)\n");
exit(0);
}
float requant_scale = 0.5f;
const auto out_element_op = OutElementOp{requant_scale, ActivationOp{}};
run_conv2d_fwd_bias_perlayer_quantization_example(out_element_op);
run_conv2d_fwd_bias_perlayer_quantization_example(out_element_op, do_verification, time_kernel);
}

View File

@@ -79,9 +79,29 @@ using DeviceGroupedConvNDFwdInstance =
#include "run_conv2d_fwd_bias_perchannel_quantization_example.inc"
int main()
int main(int argc, char* argv[])
{
bool do_verification = true;
bool time_kernel = true;
if(argc == 1)
{
// use default
}
else if(argc == 3)
{
do_verification = std::stoi(argv[1]);
time_kernel = std::stoi(argv[2]);
}
else
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: time kernel (0=no, 1=yes)\n");
exit(0);
}
float scale_z_inv = 0.5f;
const auto out_element_op = OutElementOp{scale_z_inv, ActivationOp{}};
run_conv2d_fwd_bias_perchannel_quantization_example(out_element_op);
run_conv2d_fwd_bias_perchannel_quantization_example(
out_element_op, do_verification, time_kernel);
};

View File

@@ -76,10 +76,29 @@ using DeviceGroupedConvNDFwdInstance =
#include "run_conv2d_fwd_bias_perlayer_quantization_example.inc"
int main()
int main(int argc, char* argv[])
{
bool do_verification = true;
bool time_kernel = true;
if(argc == 1)
{
// use default
}
else if(argc == 3)
{
do_verification = std::stoi(argv[1]);
time_kernel = std::stoi(argv[2]);
}
else
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: time kernel (0=no, 1=yes)\n");
exit(0);
}
float scale_acc = 0.5f;
float scale_z_inv = 0.5f;
const auto out_element_op = OutElementOp{scale_z_inv, scale_acc, ActivationOp{}};
run_conv2d_fwd_bias_perlayer_quantization_example(out_element_op);
run_conv2d_fwd_bias_perlayer_quantization_example(out_element_op, do_verification, time_kernel);
}

View File

@@ -76,8 +76,27 @@ using DeviceGroupedConvNDFwdInstance =
#include "run_conv2d_fwd_perchannel_quantization_example.inc"
int main()
int main(int argc, char* argv[])
{
bool do_verification = true;
bool time_kernel = true;
if(argc == 1)
{
// use default
}
else if(argc == 3)
{
do_verification = std::stoi(argv[1]);
time_kernel = std::stoi(argv[2]);
}
else
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: time kernel (0=no, 1=yes)\n");
exit(0);
}
const auto out_element_op = OutElementOp{ActivationOp{}};
run_conv2d_fwd_perchannel_quantization_example(out_element_op);
run_conv2d_fwd_perchannel_quantization_example(out_element_op, do_verification, time_kernel);
}

View File

@@ -71,9 +71,28 @@ using DeviceGroupedConvNDFwdInstance =
#include "run_conv2d_fwd_perlayer_quantization_example.inc"
int main()
int main(int argc, char* argv[])
{
bool do_verification = true;
bool time_kernel = false;
if(argc == 1)
{
// use default
}
else if(argc == 3)
{
do_verification = std::stoi(argv[1]);
time_kernel = std::stoi(argv[2]);
}
else
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: time kernel (0=no, 1=yes)\n");
exit(0);
}
float requant_scale = 0.5f;
const auto out_element_op = OutElementOp{requant_scale, ActivationOp{}};
run_conv2d_fwd_perlayer_quantization_example(out_element_op);
run_conv2d_fwd_perlayer_quantization_example(out_element_op, do_verification, time_kernel);
}

View File

@@ -82,8 +82,28 @@ using DeviceGroupedConvNDFwdInstance =
#include "run_conv2d_fwd_bias_perchannel_quantization_example.inc"
int main()
int main(int argc, char* argv[])
{
bool do_verification = true;
bool time_kernel = true;
if(argc == 1)
{
// use default
}
else if(argc == 3)
{
do_verification = std::stoi(argv[1]);
time_kernel = std::stoi(argv[2]);
}
else
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: time kernel (0=no, 1=yes)\n");
exit(0);
}
const auto out_element_op = OutElementOp{ActivationOp{}};
run_conv2d_fwd_bias_perchannel_quantization_example(out_element_op);
run_conv2d_fwd_bias_perchannel_quantization_example(
out_element_op, do_verification, time_kernel);
};

View File

@@ -80,9 +80,28 @@ using DeviceGroupedConvNDFwdInstance =
#include "run_conv2d_fwd_bias_perlayer_quantization_example.inc"
int main()
int main(int argc, char* argv[])
{
bool do_verification = true;
bool time_kernel = true;
if(argc == 1)
{
// use default
}
else if(argc == 3)
{
do_verification = std::stoi(argv[1]);
time_kernel = std::stoi(argv[2]);
}
else
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: time kernel (0=no, 1=yes)\n");
exit(0);
}
float requant_scale = 0.5f;
const auto out_element_op = OutElementOp{requant_scale, ActivationOp{}};
run_conv2d_fwd_bias_perlayer_quantization_example(out_element_op);
run_conv2d_fwd_bias_perlayer_quantization_example(out_element_op, do_verification, time_kernel);
}

View File

@@ -80,8 +80,27 @@ using DeviceGroupedConvNDFwdInstance =
#include "run_conv2d_fwd_perchannel_quantization_example.inc"
int main()
int main(int argc, char* argv[])
{
bool do_verification = true;
bool time_kernel = true;
if(argc == 1)
{
// use default
}
else if(argc == 3)
{
do_verification = std::stoi(argv[1]);
time_kernel = std::stoi(argv[2]);
}
else
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: time kernel (0=no, 1=yes)\n");
exit(0);
}
const auto out_element_op = OutElementOp{ActivationOp{}};
run_conv2d_fwd_perchannel_quantization_example(out_element_op);
run_conv2d_fwd_perchannel_quantization_example(out_element_op, do_verification, time_kernel);
}

View File

@@ -75,9 +75,28 @@ using DeviceGroupedConvNDFwdInstance =
#include "run_conv2d_fwd_perlayer_quantization_example.inc"
int main()
int main(int argc, char* argv[])
{
bool do_verification = true;
bool time_kernel = false;
if(argc == 1)
{
// use default
}
else if(argc == 3)
{
do_verification = std::stoi(argv[1]);
time_kernel = std::stoi(argv[2]);
}
else
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: time kernel (0=no, 1=yes)\n");
exit(0);
}
float requant_scale = 0.5f;
const auto out_element_op = OutElementOp{requant_scale, ActivationOp{}};
run_conv2d_fwd_perlayer_quantization_example(out_element_op);
run_conv2d_fwd_perlayer_quantization_example(out_element_op, do_verification, time_kernel);
}

View File

@@ -167,10 +167,10 @@ bool run_grouped_conv_fwd(bool do_verification,
return (pass ? 0 : 1);
}
int run_conv2d_fwd_bias_perchannel_quantization_example(const OutElementOp& out_element_op)
int run_conv2d_fwd_bias_perchannel_quantization_example(const OutElementOp& out_element_op,
bool do_verification,
bool time_kernel)
{
bool do_verification = true;
bool time_kernel = true;
const ck::index_t ndim_spatial = 2;
ck::utils::conv::ConvParam conv_param{

View File

@@ -155,10 +155,10 @@ bool run_grouped_conv_fwd(bool do_verification,
return (pass ? 0 : 1);
}
int run_conv2d_fwd_bias_perlayer_quantization_example(const OutElementOp& out_element_op)
int run_conv2d_fwd_bias_perlayer_quantization_example(const OutElementOp& out_element_op,
bool do_verification,
bool time_kernel)
{
bool do_verification = true;
bool time_kernel = true;
const ck::index_t ndim_spatial = 2;
ck::utils::conv::ConvParam conv_param{

View File

@@ -157,10 +157,10 @@ bool run_grouped_conv_fwd(bool do_verification,
return (pass ? 0 : 1);
}
int run_conv2d_fwd_perchannel_quantization_example(const OutElementOp& out_element_op)
int run_conv2d_fwd_perchannel_quantization_example(const OutElementOp& out_element_op,
bool do_verification,
bool time_kernel)
{
bool do_verification = true;
bool time_kernel = true;
const ck::index_t ndim_spatial = 2;
ck::utils::conv::ConvParam conv_param{

View File

@@ -139,10 +139,10 @@ bool run_grouped_conv_fwd(bool do_verification,
return (pass ? 0 : 1);
}
int run_conv2d_fwd_perlayer_quantization_example(const OutElementOp& out_element_op)
int run_conv2d_fwd_perlayer_quantization_example(const OutElementOp& out_element_op,
bool do_verification,
bool time_kernel)
{
bool do_verification = true;
bool time_kernel = false;
const ck::index_t ndim_spatial = 2;
ck::utils::conv::ConvParam conv_param{

View File

@@ -11,21 +11,36 @@ int run_groupnorm_fwd_example(int argc, char* argv[])
ck::index_t G = 64;
ck::index_t C = 128;
bool do_verification = true;
bool time_kernel = true;
bool log_kernel = true;
if(argc == 1)
{
// use default case
}
else if(argc == 6)
else if(argc == 4)
{
N = std::stoi(argv[1]);
H = std::stoi(argv[2]);
W = std::stoi(argv[3]);
G = std::stoi(argv[4]);
C = std::stoi(argv[5]);
do_verification = std::stoi(argv[1]);
time_kernel = std::stoi(argv[2]);
log_kernel = std::stoi(argv[3]);
}
else if(argc == 9)
{
do_verification = std::stoi(argv[1]);
time_kernel = std::stoi(argv[2]);
log_kernel = std::stoi(argv[3]);
N = std::stoi(argv[4]);
H = std::stoi(argv[5]);
W = std::stoi(argv[6]);
G = std::stoi(argv[7]);
C = std::stoi(argv[8]);
}
else
{
std::cerr << "arg1 to 5: N, H, W, G, C" << std::endl;
std::cerr << "arg1 = verify(0=no, 1=yes), arg2 = time kernels(0=no, 1=yes), arg3 = log "
"kernels(0=no, 1=yes), arg4 to 8: N, H, W, G, C"
<< std::endl;
return 1;
}
@@ -94,7 +109,8 @@ int run_groupnorm_fwd_example(int argc, char* argv[])
device_instance.SetWorkSpacePointer(argument_ptr.get(), workspace_dev.GetDeviceBuffer());
auto invoker_ptr = device_instance.MakeInvokerPointer();
float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true, true});
float ave_time =
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel, log_kernel});
std::size_t num_btype = sizeof(XDataType) * N * H * W * G * C +
sizeof(YDataType) * N * H * W * G * C + sizeof(GammaDataType) * G * C +
@@ -106,6 +122,7 @@ int run_groupnorm_fwd_example(int argc, char* argv[])
<< device_instance.GetTypeString() << std::endl;
bool pass = true;
if(do_verification)
{
Tensor<YDataType> host_y({N, H, W, G, C});
Tensor<SaveMeanInvStdDataType> host_save_mean(HostTensorDescriptor{N, G});

View File

@@ -44,11 +44,27 @@ using DeviceElementwisePermuteInstance = ck::tensor_operation::device::DeviceEle
ck::Sequence<8, 8>, // InScalarPerVectorSeq
ck::Sequence<8>>; // OutScalarPerVectorSeq
int main()
int main(int argc, char* argv[])
{
bool do_verification = true;
bool time_kernel = true;
if(argc == 1)
{
// use default
}
else if(argc == 3)
{
do_verification = std::stoi(argv[1]);
time_kernel = std::stoi(argv[2]);
}
else
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: time kernel (0=no, 1=yes)\n");
exit(0);
}
std::vector<std::size_t> nchw = {16, 128, 32, 64};
std::array<ck::index_t, 4> ab_lengths;
std::array<ck::index_t, 4> ab_strides = {static_cast<int>(nchw[1] * nchw[2] * nchw[3]),

View File

@@ -37,11 +37,27 @@ using DeviceElementwisePermuteInstance = ck::tensor_operation::device::DeviceEle
ck::Sequence<8>, // InScalarPerVectorSeq
ck::Sequence<8>>; // OutScalarPerVectorSeq
int main()
int main(int argc, char* argv[])
{
bool do_verification = true;
bool time_kernel = true;
if(argc == 1)
{
// use default
}
else if(argc == 3)
{
do_verification = std::stoi(argv[1]);
time_kernel = std::stoi(argv[2]);
}
else
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: time kernel (0=no, 1=yes)\n");
exit(0);
}
std::vector<std::size_t> nchw = {16, 128, 32, 64};
std::vector<std::size_t> nhwc = {16, 32, 64, 128};

View File

@@ -41,11 +41,27 @@ using DeviceElementwisePermuteInstance = ck::tensor_operation::device::DeviceEle
ck::Sequence<8>, // InScalarPerVectorSeq
ck::Sequence<8>>; // OutScalarPerVectorSeq
int main()
int main(int argc, char* argv[])
{
bool do_verification = true;
bool time_kernel = true;
if(argc == 1)
{
// use default
}
else if(argc == 3)
{
do_verification = std::stoi(argv[1]);
time_kernel = std::stoi(argv[2]);
}
else
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: time kernel (0=no, 1=yes)\n");
exit(0);
}
std::vector<std::size_t> nchw = {16, 8, 32, 64};
std::vector<std::size_t> nhwc = {16, 32, 64, 8};
std::array<ck::index_t, 4> ab_lengths;

View File

@@ -40,11 +40,27 @@ using DeviceElementwisePermuteInstance = ck::tensor_operation::device::DeviceEle
ck::Sequence<8>, // InScalarPerVectorSeq
ck::Sequence<8>>; // OutScalarPerVectorSeq
int main()
int main(int argc, char* argv[])
{
bool do_verification = true;
bool time_kernel = true;
if(argc == 1)
{
// use default
}
else if(argc == 3)
{
do_verification = std::stoi(argv[1]);
time_kernel = std::stoi(argv[2]);
}
else
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: time kernel (0=no, 1=yes)\n");
exit(0);
}
std::vector<std::size_t> nchw = {16, 128, 32, 64};
std::vector<std::size_t> nhwc = {16, 32, 64, 128};

View File

@@ -40,11 +40,27 @@ using DeviceElementwisePermuteInstance = ck::tensor_operation::device::DeviceEle
ck::Sequence<1>, // InScalarPerVectorSeq
ck::Sequence<1>>; // OutScalarPerVectorSeq
int main()
int main(int argc, char* argv[])
{
bool do_verification = true;
bool time_kernel = true;
if(argc == 1)
{
// use default
}
else if(argc == 3)
{
do_verification = std::stoi(argv[1]);
time_kernel = std::stoi(argv[2]);
}
else
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: time kernel (0=no, 1=yes)\n");
exit(0);
}
std::vector<std::size_t> nchw = {16, 8, 32, 64};
std::vector<std::size_t> nhwc = {16, 32, 64, 8};
std::array<ck::index_t, 4> ab_lengths;

View File

@@ -40,11 +40,27 @@ using DeviceElementwisePermuteInstance = ck::tensor_operation::device::DeviceEle
ck::Sequence<8>, // InScalarPerVectorSeq
ck::Sequence<8>>; // OutScalarPerVectorSeq
int main()
int main(int argc, char* argv[])
{
bool do_verification = true;
bool time_kernel = true;
if(argc == 1)
{
// use default
}
else if(argc == 3)
{
do_verification = std::stoi(argv[1]);
time_kernel = std::stoi(argv[2]);
}
else
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: time kernel (0=no, 1=yes)\n");
exit(0);
}
std::vector<std::size_t> nchw = {16, 128, 32, 64};
std::vector<std::size_t> nhwc = {16, 32, 64, 128};

View File

@@ -119,6 +119,22 @@ int main(int argc, char* argv[])
bool do_verification = true;
bool time_kernel = true;
if(argc == 1)
{
// use default
}
else if(argc == 3)
{
do_verification = std::stoi(argv[1]);
time_kernel = std::stoi(argv[2]);
}
else
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: time kernel (0=no, 1=yes)\n");
exit(0);
}
const float scale = 2.f;
ck::index_t M = 1024;

View File

@@ -48,11 +48,27 @@ using DeviceElementwisePermuteInstance = ck::tensor_operation::device::DeviceEle
ck::Sequence<8, 8, 8>, // InScalarPerVectorSeq
ck::Sequence<8>>; // OutScalarPerVectorSeq
int main()
int main(int argc, char* argv[])
{
bool do_verification = true;
bool time_kernel = true;
if(argc == 1)
{
// use default
}
else if(argc == 3)
{
do_verification = std::stoi(argv[1]);
time_kernel = std::stoi(argv[2]);
}
else
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: time kernel (0=no, 1=yes)\n");
exit(0);
}
std::vector<std::size_t> nchw = {16, 128, 32, 64};
std::array<ck::index_t, 4> ab_lengths;
std::array<ck::index_t, 4> ab_strides = {static_cast<int>(nchw[1] * nchw[2] * nchw[3]),

View File

@@ -77,9 +77,26 @@ void host_elementwise2D(HostTensorC& C,
}
}
int main()
int main(int argc, char* argv[])
{
bool time_kernel = true;
bool do_verification = true;
bool time_kernel = true;
if(argc == 1)
{
// use default
}
else if(argc == 3)
{
do_verification = std::stoi(argv[1]);
time_kernel = std::stoi(argv[2]);
}
else
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: time kernel (0=no, 1=yes)\n");
exit(0);
}
ck::index_t M = 48 * 256;
ck::index_t N = 1024;
@@ -157,6 +174,7 @@ int main()
std::cout << "Time elapase is : " << ela_time << " ms . " << std::endl;
bool pass = true;
if(do_verification)
{
std::vector<std::size_t> mn = {static_cast<unsigned long>(M),
static_cast<unsigned long>(N)};