From f2dd2e5b092dc6d3317a90baade0207b2ea1bcdd Mon Sep 17 00:00:00 2001 From: Po Yen Chen Date: Sat, 12 Nov 2022 01:36:01 +0800 Subject: [PATCH] Rangify constructor of HostTensorDescriptor & Tensor<> (#445) * Rangify STL algorithms This commit adapts rangified std::copy(), std::fill() & std::transform() * Rangify check_err() By rangifying check_err(), we can not only compare values between std::vector<>s, but also compare any ranges which have same value type. * Allow constructing Tensor<> like a HostTensorDescriptor * Simplify Tensor<> object construction logics * Remove more unnecessary 'HostTensorDescriptor' objects * Re-format example code * Re-write more HostTensorDescriptor ctor call [ROCm/composable_kernel commit: 4a2a56c22f75263d70c710950ab7313f072a2523] --- example/01_gemm/gemm_xdl_skip_b_lds_fp16.cpp | 12 +-- example/01_gemm/run_gemm_example.inc | 4 +- .../gemm_bilinear_xdl_fp16.cpp | 14 +-- .../gemm_bias_relu_xdl_fp16.cpp | 11 +-- .../run_gemm_add_add_fastgelu_example.inc | 6 +- example/09_convnd_fwd/convnd_fwd_common.hpp | 5 +- .../common.hpp | 12 +-- .../run_convnd_fwd_max_example.inc | 19 ++-- example/12_reduce/reduce_blockwise_impl.hpp | 4 +- .../12_reduce/reduce_blockwise_two_call.cpp | 2 +- .../reduce_multiblock_atomic_add_impl.hpp | 2 +- example/13_pool2d_fwd/pool2d_fwd_common.hpp | 15 ++-- .../gemm_xdl_relu_quantization_int8.cpp | 11 +-- .../grouped_gemm_xdl_bfp16.cpp | 1 + .../15_grouped_gemm/grouped_gemm_xdl_fp16.cpp | 1 + .../15_grouped_gemm/grouped_gemm_xdl_fp32.cpp | 1 + .../15_grouped_gemm/grouped_gemm_xdl_int4.cpp | 1 + .../15_grouped_gemm/grouped_gemm_xdl_int8.cpp | 1 + .../run_grouped_gemm_example.inc | 12 +-- .../gemm_add_add_mean_meansquare_xdl_fp16.cpp | 21 ++--- .../gemm_add_addsquare_xdl_int8.cpp | 8 +- .../gemm_reduce_xdl_common.hpp | 21 ++--- .../convnd_bwd_data_common.hpp | 2 +- .../batched_gemm_reduce_xdl_fp16.cpp | 34 ++++--- .../broadcast_add_2d_amn_bn.cpp | 12 +-- .../broadcast_add_3d_am_bmnk.cpp | 13 ++- .../elementwise_add_1d.cpp | 6 +- .../elementwise_add_4d.cpp | 12 +-- .../gemm_bias_relu_add_layernorm_xdl_fp16.cpp | 16 ++-- .../gemm_layernorm_xdl_fp16.cpp | 19 ++-- .../gemm_xdl_layernorm_single_kernel_fp16.cpp | 24 +++-- example/22_cgemm/cgemm_xdl_common.hpp | 25 +++--- example/23_softmax/softmax_blockwise.cpp | 2 +- .../run_batched_gemm_example.inc | 12 +-- .../gemm_bias_e_permute_g1m2n3k1_xdl_fp16.cpp | 28 ++---- .../gemm_bias_e_permute_g1m3n2k1_xdl_fp16.cpp | 28 ++---- .../contraction_bilinear_xdl_fp32.cpp | 26 ++---- .../contraction_scale_xdl_fp32.cpp | 22 ++--- example/27_layernorm/layernorm_blockwise.cpp | 12 +-- .../grouped_gemm_bias_e_permute_xdl_fp16.cpp | 26 ++---- .../batched_gemm_bias_e_permute_xdl_fp16.cpp | 28 ++---- ...grouped_conv_fwd_bias_relu_add_example.inc | 6 +- .../batched_gemm_gemm_xdl_bf16.cpp | 1 + .../batched_gemm_gemm_xdl_fp16.cpp | 1 + .../batched_gemm_gemm_xdl_fp32.cpp | 1 + .../batched_gemm_gemm_xdl_int4.cpp | 1 + .../batched_gemm_gemm_xdl_int8.cpp | 1 + .../run_batched_gemm_gemm_example.inc | 10 +-- ...le_scale_softmax_gemm_permute_xdl_fp16.cpp | 1 + ...mm_scale_softmax_gemm_permute_xdl_fp16.cpp | 1 + ...tched_gemm_scale_softmax_gemm_xdl_fp16.cpp | 11 +-- ...mm_scale_softmax_gemm_permute_xdl_fp16.cpp | 1 + ...atched_gemm_scale_softmax_gemm_permute.inc | 4 +- ...rouped_gemm_scale_softmax_gemm_permute.inc | 24 ++--- .../33_multiple_reduce/dual_reduce_common.hpp | 13 +-- .../34_batchnorm/batchnorm_forward_nhwc.cpp | 26 +++--- example/34_batchnorm/batchnorm_infer_nhwc.cpp | 15 ++-- .../run_splitK_gemm_example.inc | 17 ++-- .../sparse_embedding3_forward_layernorm.cpp | 9 +- ...ed_gemm_add_add_relu_gemm_add_xdl_fp16.cpp | 12 +-- ...rouped_conv_bwd_data_bias_relu_example.inc | 4 +- example/39_permute/common.hpp | 18 +--- .../39_permute/run_permute_bundle_example.inc | 2 +- .../grouped_conv_conv_fwd_xdl_bf16.cpp | 1 + .../grouped_conv_conv_fwd_xdl_fp16.cpp | 1 + .../grouped_conv_conv_fwd_xdl_fp32.cpp | 1 + .../grouped_conv_conv_fwd_xdl_int4.cpp | 1 + .../grouped_conv_conv_fwd_xdl_int8.cpp | 1 + .../run_grouped_conv_conv_fwd_example.inc | 4 +- .../42_groupnorm/groupnorm_sigmoid_fp16.cpp | 2 +- .../cpu/reference_gemm_layernorm.hpp | 4 +- .../include/ck/library/utility/algorithm.hpp | 43 +++++++++ .../include/ck/library/utility/check_err.hpp | 89 ++++++++++--------- .../ck/library/utility/host_tensor.hpp | 54 +++++------ .../include/ck/library/utility/iterator.hpp | 22 +++++ library/include/ck/library/utility/ranges.hpp | 60 +++++++++++++ ...le_batched_gemm_add_relu_gemm_add_impl.hpp | 12 +-- .../profile_batched_gemm_gemm_impl.hpp | 12 +-- .../include/profile_batched_gemm_impl.hpp | 12 +-- .../profile_batched_gemm_reduce_impl.hpp | 30 +++---- ...profile_batched_gemm_softmax_gemm_impl.hpp | 12 +-- ...batched_gemm_softmax_gemm_permute_impl.hpp | 5 +- .../include/profile_conv_bwd_data_impl.hpp | 3 +- .../profile_conv_fwd_bias_relu_add_impl.hpp | 15 ++-- .../profile_conv_fwd_bias_relu_impl.hpp | 15 ++-- profiler/include/profile_conv_fwd_impl.hpp | 2 +- .../include/profile_convnd_bwd_data_impl.hpp | 2 +- .../profile_convnd_bwd_weight_impl.hpp | 2 +- .../profile_elementwise_layernorm_impl.hpp | 6 +- .../profile_gemm_add_add_fastgelu_impl.hpp | 15 ++-- .../profile_gemm_bias_add_reduce_impl.hpp | 30 +++---- .../include/profile_gemm_bilinear_impl.hpp | 15 ++-- profiler/include/profile_gemm_impl.hpp | 12 +-- profiler/include/profile_gemm_reduce_impl.hpp | 27 +++--- profiler/include/profile_gemm_splitk_impl.hpp | 12 +-- .../profile_grouped_conv_bwd_weight_impl.hpp | 3 +- .../include/profile_grouped_conv_fwd_impl.hpp | 5 +- .../include/profile_grouped_gemm_impl.hpp | 12 +-- profiler/include/profile_groupnorm_impl.hpp | 3 +- profiler/include/profile_reduce_impl.hpp | 5 +- test/gemm/gemm_util.hpp | 19 ++-- test/gemm_split_k/gemm_split_k.cpp | 9 +- .../reference_conv_fwd/reference_conv_fwd.cpp | 18 ++-- 103 files changed, 657 insertions(+), 649 deletions(-) create mode 100644 library/include/ck/library/utility/algorithm.hpp create mode 100644 library/include/ck/library/utility/iterator.hpp create mode 100644 library/include/ck/library/utility/ranges.hpp diff --git a/example/01_gemm/gemm_xdl_skip_b_lds_fp16.cpp b/example/01_gemm/gemm_xdl_skip_b_lds_fp16.cpp index 8ee98156e8..12a6992597 100644 --- a/example/01_gemm/gemm_xdl_skip_b_lds_fp16.cpp +++ b/example/01_gemm/gemm_xdl_skip_b_lds_fp16.cpp @@ -6,6 +6,8 @@ #include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl.hpp" #include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_skip_b_lds.hpp" +#include "ck/library/utility/literals.hpp" + using F16 = ck::half_t; using F32 = float; @@ -135,15 +137,15 @@ int main(int argc, char* argv[]) auto f_host_tensor_descriptor = [](std::size_t row, std::size_t col, std::size_t stride, auto layout) { + using namespace ck::literals; + if(std::is_same::value) { - return HostTensorDescriptor(std::vector({row, col}), - std::vector({stride, 1})); + return HostTensorDescriptor({row, col}, {stride, 1_uz}); } else { - return HostTensorDescriptor(std::vector({row, col}), - std::vector({1, stride})); + return HostTensorDescriptor({row, col}, {1_uz, stride}); } }; @@ -240,7 +242,7 @@ int main(int argc, char* argv[]) show_2d_matrix(std::cout << "c_host :", c_m_n_host_result) << std::endl; } #endif - ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData); + ck::utils::check_err(c_m_n_device_result, c_m_n_host_result); } return 0; diff --git a/example/01_gemm/run_gemm_example.inc b/example/01_gemm/run_gemm_example.inc index 4d3759eb9d..4e2cedb52a 100644 --- a/example/01_gemm/run_gemm_example.inc +++ b/example/01_gemm/run_gemm_example.inc @@ -131,11 +131,11 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config) c_m_n_device_result = c_m_n_device_result_converted.CopyAsType(); - return ck::utils::check_err(c_m_n_device_result_converted.mData, c_m_n_host_result.mData); + return ck::utils::check_err(c_m_n_device_result_converted, c_m_n_host_result); #else c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data()); - return ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData); + return ck::utils::check_err(c_m_n_device_result, c_m_n_host_result); #endif } diff --git a/example/02_gemm_bilinear/gemm_bilinear_xdl_fp16.cpp b/example/02_gemm_bilinear/gemm_bilinear_xdl_fp16.cpp index d1b8ca10a9..917b6b1c31 100644 --- a/example/02_gemm_bilinear/gemm_bilinear_xdl_fp16.cpp +++ b/example/02_gemm_bilinear/gemm_bilinear_xdl_fp16.cpp @@ -14,6 +14,7 @@ #include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" #include "ck/library/utility/check_err.hpp" @@ -177,15 +178,15 @@ int main(int argc, char* argv[]) auto f_host_tensor_descriptor = [](std::size_t row, std::size_t col, std::size_t stride, auto layout) { + using namespace ck::literals; + if(std::is_same::value) { - return HostTensorDescriptor(std::vector({row, col}), - std::vector({stride, 1})); + return HostTensorDescriptor({row, col}, {stride, 1_uz}); } else { - return HostTensorDescriptor(std::vector({row, col}), - std::vector({1, stride})); + return HostTensorDescriptor({row, col}, {1_uz, stride}); } }; @@ -271,8 +272,7 @@ int main(int argc, char* argv[]) if(do_verification) { - Tensor c_m_n(HostTensorDescriptor( - std::vector{static_cast(M), static_cast(N)})); + Tensor c_m_n({M, N}); using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm::value) { - return HostTensorDescriptor(std::vector({row, col}), - std::vector({stride, 1})); + return HostTensorDescriptor({row, col}, {stride, 1_uz}); } else { - return HostTensorDescriptor(std::vector({row, col}), - std::vector({1, stride})); + return HostTensorDescriptor({row, col}, {1_uz, stride}); } }; @@ -275,7 +276,7 @@ int main(int argc, char* argv[]) } } - return ck::utils::check_err(e_m_n_device_result.mData, e_m_n_host_result.mData) ? 0 : 1; + return ck::utils::check_err(e_m_n_device_result, e_m_n_host_result) ? 0 : 1; } return 0; diff --git a/example/04_gemm_add_add_fastgelu/run_gemm_add_add_fastgelu_example.inc b/example/04_gemm_add_add_fastgelu/run_gemm_add_add_fastgelu_example.inc index 645e98dfbb..f3def33b56 100644 --- a/example/04_gemm_add_add_fastgelu/run_gemm_add_add_fastgelu_example.inc +++ b/example/04_gemm_add_add_fastgelu/run_gemm_add_add_fastgelu_example.inc @@ -124,7 +124,7 @@ bool run_gemm_add_add_fastgelu(const ProblemSize& problem_size, const ExecutionC if(config.do_verification) { - Tensor c_m_n(HostTensorDescriptor{M, N}); + Tensor c_m_n({M, N}); auto ref_gemm = ReferenceGemmInstance{}; auto ref_invoker = ref_gemm.MakeInvoker(); @@ -147,9 +147,9 @@ bool run_gemm_add_add_fastgelu(const ProblemSize& problem_size, const ExecutionC #ifdef BUILD_INT4_EXAMPLE const Tensor e_m_n_device_result_converted(e_m_n_device_result); - return ck::utils::check_err(e_m_n_device_result_converted.mData, e_m_n_host_result.mData); + return ck::utils::check_err(e_m_n_device_result_converted, e_m_n_host_result); #else - return ck::utils::check_err(e_m_n_device_result.mData, e_m_n_host_result.mData); + return ck::utils::check_err(e_m_n_device_result, e_m_n_host_result); #endif } diff --git a/example/09_convnd_fwd/convnd_fwd_common.hpp b/example/09_convnd_fwd/convnd_fwd_common.hpp index 1995cfa314..4c594ccdf8 100644 --- a/example/09_convnd_fwd/convnd_fwd_common.hpp +++ b/example/09_convnd_fwd/convnd_fwd_common.hpp @@ -10,6 +10,7 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/library/utility/algorithm.hpp" #include "ck/library/utility/check_err.hpp" #include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/host_tensor.hpp" @@ -84,7 +85,7 @@ bool run_grouped_conv_fwd(bool do_verification, std::array input_left_pads{}; std::array input_right_pads{}; - auto copy = [](auto& x, auto& y) { std::copy(x.begin(), x.end(), y.begin()); }; + auto copy = [](const auto& x, auto& y) { ck::ranges::copy(x, y.begin()); }; copy(in_g_n_c_wis_desc.GetLengths(), a_g_n_c_wis_lengths); copy(in_g_n_c_wis_desc.GetStrides(), a_g_n_c_wis_strides); @@ -164,7 +165,7 @@ bool run_grouped_conv_fwd(bool do_verification, out_device_buf.FromDevice(out_device.mData.data()); return ck::utils::check_err( - out_device.mData, out_host.mData, "Error: incorrect results!", 1e-5f, 1e-4f); + out_device, out_host, "Error: incorrect results!", 1e-5f, 1e-4f); } return true; diff --git a/example/10_convnd_fwd_multiple_d_multiple_reduce/common.hpp b/example/10_convnd_fwd_multiple_d_multiple_reduce/common.hpp index 642315fc6b..00e370f296 100644 --- a/example/10_convnd_fwd_multiple_d_multiple_reduce/common.hpp +++ b/example/10_convnd_fwd_multiple_d_multiple_reduce/common.hpp @@ -16,6 +16,7 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/library/utility/algorithm.hpp" #include "ck/library/utility/check_err.hpp" #include "ck/library/utility/convolution_parameter.hpp" #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" @@ -140,9 +141,7 @@ make_r0_host_tensor_descriptor(const ck::utils::conv::ConvParam& problem_size) { std::vector dimensions{problem_size.G_, problem_size.N_}; - std::copy(begin(problem_size.output_spatial_lengths_), - end(problem_size.output_spatial_lengths_), - std::back_inserter(dimensions)); + ck::ranges::copy(problem_size.output_spatial_lengths_, std::back_inserter(dimensions)); return HostTensorDescriptor(dimensions); } @@ -158,10 +157,3 @@ void unpack_host_tensor_descriptor(const HostTensorDescriptor& descriptor, assert(size(descriptor.GetStrides()) == size(strides)); std::copy_n(begin(descriptor.GetStrides()), size(descriptor.GetStrides()), begin(strides)); } - -template -auto copy(const Range& range, OutputIterator iter) - -> decltype(std::copy(std::begin(range), std::end(range), iter)) -{ - return std::copy(std::begin(range), std::end(range), iter); -} diff --git a/example/10_convnd_fwd_multiple_d_multiple_reduce/run_convnd_fwd_max_example.inc b/example/10_convnd_fwd_multiple_d_multiple_reduce/run_convnd_fwd_max_example.inc index c93ee941c1..b3a3891781 100644 --- a/example/10_convnd_fwd_multiple_d_multiple_reduce/run_convnd_fwd_max_example.inc +++ b/example/10_convnd_fwd_multiple_d_multiple_reduce/run_convnd_fwd_max_example.inc @@ -120,10 +120,10 @@ bool run_convnd_fwd_max(const ck::utils::conv::ConvParam& problem_size, conv_output_g_n_k_wos_desc, conv_output_g_n_k_wos_lengths, conv_output_g_n_k_wos_strides); unpack_host_tensor_descriptor(r0_desc, r0_lengths, r0_strides); - copy(problem_size.conv_filter_strides_, begin(conv_filter_strides)); - copy(problem_size.conv_filter_dilations_, begin(conv_filter_dilations)); - copy(problem_size.input_left_pads_, begin(input_left_pads)); - copy(problem_size.input_right_pads_, begin(input_right_pads)); + ck::ranges::copy(problem_size.conv_filter_strides_, begin(conv_filter_strides)); + ck::ranges::copy(problem_size.conv_filter_dilations_, begin(conv_filter_dilations)); + ck::ranges::copy(problem_size.input_left_pads_, begin(input_left_pads)); + ck::ranges::copy(problem_size.input_right_pads_, begin(input_right_pads)); // run Conv + Reduction on device auto conv = DeviceInstance{}; @@ -273,16 +273,13 @@ bool run_convnd_fwd_max(const ck::utils::conv::ConvParam& problem_size, conv_output_device_buf.FromDevice(conv_output_device.mData.data()); r0_device_buf.FromDevice(r0_device.mData.data()); - return ck::utils::check_err(conv_output_device.mData, - conv_output_host.mData, + return ck::utils::check_err(conv_output_device, + conv_output_host, "Error: incorrect results! (Matrix E)", 1e-5f, 1e-4f) && - ck::utils::check_err(r0_device.mData, - r0_host.mData, - "Error: incorrect results! (Matrix R0)", - 1e-5f, - 1e-4f); + ck::utils::check_err( + r0_device, r0_host, "Error: incorrect results! (Matrix R0)", 1e-5f, 1e-4f); } return true; diff --git a/example/12_reduce/reduce_blockwise_impl.hpp b/example/12_reduce/reduce_blockwise_impl.hpp index ad5537eb45..70f9240148 100644 --- a/example/12_reduce/reduce_blockwise_impl.hpp +++ b/example/12_reduce/reduce_blockwise_impl.hpp @@ -324,12 +324,12 @@ int reduce_blockwise_impl(bool do_verification, #endif out_dev.FromDevice(out.mData.data()); - pass = pass && ck::utils::check_err(out.mData, out_ref.mData); + pass = pass && ck::utils::check_err(out, out_ref); if(OutputIndex) { out_index_dev.FromDevice(out_indices.mData.data()); - pass = pass && ck::utils::check_err(out_indices.mData, out_indices_ref.mData); + pass = pass && ck::utils::check_err(out_indices, out_indices_ref); }; }; diff --git a/example/12_reduce/reduce_blockwise_two_call.cpp b/example/12_reduce/reduce_blockwise_two_call.cpp index a5c24b13a2..e668d31a17 100644 --- a/example/12_reduce/reduce_blockwise_two_call.cpp +++ b/example/12_reduce/reduce_blockwise_two_call.cpp @@ -294,7 +294,7 @@ int main(int argc, char* argv[]) if(do_verify) { out_dev.FromDevice(out.mData.data()); - pass = pass && ck::utils::check_err(out.mData, out_ref.mData); + pass = pass && ck::utils::check_err(out, out_ref); }; return (pass ? 0 : 1); diff --git a/example/12_reduce/reduce_multiblock_atomic_add_impl.hpp b/example/12_reduce/reduce_multiblock_atomic_add_impl.hpp index 0a5355f337..d488612b55 100644 --- a/example/12_reduce/reduce_multiblock_atomic_add_impl.hpp +++ b/example/12_reduce/reduce_multiblock_atomic_add_impl.hpp @@ -225,7 +225,7 @@ int reduce_multiblock_atomic_add_impl(bool do_verification, if(do_verification) { out_dev.FromDevice(out.mData.data()); - pass = pass && ck::utils::check_err(out.mData, out_ref.mData); + pass = pass && ck::utils::check_err(out, out_ref); }; return (pass ? 0 : 1); diff --git a/example/13_pool2d_fwd/pool2d_fwd_common.hpp b/example/13_pool2d_fwd/pool2d_fwd_common.hpp index ccb20aa1ea..b83cb6a96f 100644 --- a/example/13_pool2d_fwd/pool2d_fwd_common.hpp +++ b/example/13_pool2d_fwd/pool2d_fwd_common.hpp @@ -16,6 +16,7 @@ #include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" template ::value) { - return HostTensorDescriptor(std::vector({N_, C_, H, W}), - std::vector({C_ * H * W, H * W, W, 1})); + return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, H * W, W, 1_uz}); } else if constexpr(ck::is_same::value) { - return HostTensorDescriptor(std::vector({N_, C_, H, W}), - std::vector({C_ * H * W, 1, W * C_, C_})); + return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, 1_uz, W * C_, C_}); } }; @@ -267,14 +268,14 @@ bool pool_test(bool do_verification, out_device_buf.FromDevice(out_n_c_ho_wo_device.mData.data()); - pass = pass && ck::utils::check_err(out_n_c_ho_wo_device.mData, out_n_c_ho_wo_host.mData); + pass = pass && ck::utils::check_err(out_n_c_ho_wo_device, out_n_c_ho_wo_host); if constexpr(OutputIndex) { out_indices_device_buf.FromDevice(out_indices_n_c_ho_wo_device.mData.data()); - pass = pass && ck::utils::check_err(out_indices_n_c_ho_wo_device.mData, - out_indices_n_c_ho_wo_host.mData); + pass = pass && + ck::utils::check_err(out_indices_n_c_ho_wo_device, out_indices_n_c_ho_wo_host); }; } diff --git a/example/14_gemm_xdl_quantization/gemm_xdl_relu_quantization_int8.cpp b/example/14_gemm_xdl_quantization/gemm_xdl_relu_quantization_int8.cpp index d2c9e66d31..bb50a90804 100644 --- a/example/14_gemm_xdl_quantization/gemm_xdl_relu_quantization_int8.cpp +++ b/example/14_gemm_xdl_quantization/gemm_xdl_relu_quantization_int8.cpp @@ -15,6 +15,7 @@ #include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" #include "ck/library/utility/check_err.hpp" @@ -133,15 +134,15 @@ int main(int argc, char* argv[]) auto f_host_tensor_descriptor = [](std::size_t row, std::size_t col, std::size_t stride, auto layout) { + using namespace ck::literals; + if(std::is_same::value) { - return HostTensorDescriptor(std::vector({row, col}), - std::vector({stride, 1})); + return HostTensorDescriptor({row, col}, {stride, 1_uz}); } else { - return HostTensorDescriptor(std::vector({row, col}), - std::vector({1, stride})); + return HostTensorDescriptor({row, col}, {1_uz, stride}); } }; @@ -225,7 +226,7 @@ int main(int argc, char* argv[]) ref_invoker.Run(ref_argument); - return ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData) ? 0 : 1; + return ck::utils::check_err(c_m_n_device_result, c_m_n_host_result) ? 0 : 1; } return 0; diff --git a/example/15_grouped_gemm/grouped_gemm_xdl_bfp16.cpp b/example/15_grouped_gemm/grouped_gemm_xdl_bfp16.cpp index 15d7d48fd2..05d572a1f5 100644 --- a/example/15_grouped_gemm/grouped_gemm_xdl_bfp16.cpp +++ b/example/15_grouped_gemm/grouped_gemm_xdl_bfp16.cpp @@ -16,6 +16,7 @@ #include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" template diff --git a/example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp b/example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp index d1c265ccdd..3f78dafa89 100644 --- a/example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp +++ b/example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp @@ -16,6 +16,7 @@ #include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" template diff --git a/example/15_grouped_gemm/grouped_gemm_xdl_fp32.cpp b/example/15_grouped_gemm/grouped_gemm_xdl_fp32.cpp index 78e2167eae..fd93bb5f87 100644 --- a/example/15_grouped_gemm/grouped_gemm_xdl_fp32.cpp +++ b/example/15_grouped_gemm/grouped_gemm_xdl_fp32.cpp @@ -16,6 +16,7 @@ #include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" template diff --git a/example/15_grouped_gemm/grouped_gemm_xdl_int4.cpp b/example/15_grouped_gemm/grouped_gemm_xdl_int4.cpp index 2113cf9431..faf41bbf0b 100644 --- a/example/15_grouped_gemm/grouped_gemm_xdl_int4.cpp +++ b/example/15_grouped_gemm/grouped_gemm_xdl_int4.cpp @@ -16,6 +16,7 @@ #include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" template diff --git a/example/15_grouped_gemm/grouped_gemm_xdl_int8.cpp b/example/15_grouped_gemm/grouped_gemm_xdl_int8.cpp index 0c35c1b6aa..7cb09778c5 100644 --- a/example/15_grouped_gemm/grouped_gemm_xdl_int8.cpp +++ b/example/15_grouped_gemm/grouped_gemm_xdl_int8.cpp @@ -16,6 +16,7 @@ #include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" template diff --git a/example/15_grouped_gemm/run_grouped_gemm_example.inc b/example/15_grouped_gemm/run_grouped_gemm_example.inc index 01ba4ec045..324e177280 100644 --- a/example/15_grouped_gemm/run_grouped_gemm_example.inc +++ b/example/15_grouped_gemm/run_grouped_gemm_example.inc @@ -52,15 +52,15 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co auto f_host_tensor_descriptor = [](std::size_t row, std::size_t col, std::size_t stride, auto layout) { + using namespace ck::literals; + if(std::is_same::value) { - return HostTensorDescriptor(std::vector({row, col}), - std::vector({stride, 1})); + return HostTensorDescriptor({row, col}, {stride, 1_uz}); } else { - return HostTensorDescriptor(std::vector({row, col}), - std::vector({1, stride})); + return HostTensorDescriptor({row, col}, {1_uz, stride}); } }; @@ -208,10 +208,10 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co #ifdef BUILD_INT4_EXAMPLE const Tensor c_device_result_converted(c_device_tensors[i]); - pass &= ck::utils::check_err(c_device_result_converted.mData, c_host_tensors[i].mData); + pass &= ck::utils::check_err(c_device_result_converted, c_host_tensors[i]); #else - pass &= ck::utils::check_err(c_device_tensors[i].mData, c_host_tensors[i].mData); + pass &= ck::utils::check_err(c_device_tensors[i], c_host_tensors[i]); #endif } } diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_add_add_mean_meansquare_xdl_fp16.cpp b/example/16_gemm_multi_d_multi_reduces/gemm_add_add_mean_meansquare_xdl_fp16.cpp index 6d57cef1ef..eb3832a668 100644 --- a/example/16_gemm_multi_d_multi_reduces/gemm_add_add_mean_meansquare_xdl_fp16.cpp +++ b/example/16_gemm_multi_d_multi_reduces/gemm_add_add_mean_meansquare_xdl_fp16.cpp @@ -15,6 +15,7 @@ #include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" #include "ck/library/utility/check_err.hpp" @@ -109,21 +110,20 @@ void DumpPerf(float ave_time, int M, int N, int K) } auto f_host_tensor_descriptor1d = [](std::size_t len, std::size_t stride) { - return HostTensorDescriptor(std::vector({len}), - std::vector({stride})); + return HostTensorDescriptor({len}, {stride}); }; auto f_host_tensor_descriptor2d = [](std::size_t row, std::size_t col, std::size_t stride, auto layout) { + using namespace ck::literals; + if(std::is_same::value) { - return HostTensorDescriptor(std::vector({row, col}), - std::vector({stride, 1})); + return HostTensorDescriptor({row, col}, {stride, 1_uz}); } else { - return HostTensorDescriptor(std::vector({row, col}), - std::vector({1, stride})); + return HostTensorDescriptor({row, col}, {1_uz, stride}); } }; @@ -259,12 +259,9 @@ int main() r0_device_buf.FromDevice(r0_m.mData.data()); r1_device_buf.FromDevice(r1_m.mData.data()); - pass = ck::utils::check_err( - e_m_n.mData, e_m_n_host.mData, "Error: Incorrect results c", 1e-2, 1e-2); - pass &= ck::utils::check_err( - r0_m.mData, r0_m_host.mData, "Error: Incorrect results d0", 1e-2, 1e-2); - pass &= ck::utils::check_err( - r1_m.mData, r1_m_host.mData, "Error: Incorrect results d1", 1e-2, 1e-2); + pass = ck::utils::check_err(e_m_n, e_m_n_host, "Error: Incorrect results c", 1e-2, 1e-2); + pass &= ck::utils::check_err(r0_m, r0_m_host, "Error: Incorrect results d0", 1e-2, 1e-2); + pass &= ck::utils::check_err(r1_m, r1_m_host, "Error: Incorrect results d1", 1e-2, 1e-2); } bool time_kernel = true; diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_add_addsquare_xdl_int8.cpp b/example/16_gemm_multi_d_multi_reduces/gemm_add_addsquare_xdl_int8.cpp index f644440334..e1248002f7 100644 --- a/example/16_gemm_multi_d_multi_reduces/gemm_add_addsquare_xdl_int8.cpp +++ b/example/16_gemm_multi_d_multi_reduces/gemm_add_addsquare_xdl_int8.cpp @@ -262,15 +262,13 @@ bool run_gemm_reduce_add_addsquare_xdl(ck::index_t M, Tensor e_m_n_host_converted(e_m_n_host); pass = ck::utils::check_err( - e_m_n.mData, e_m_n_host_converted.mData, "Error: Incorrect results c", 1e-2, 1e-2); + e_m_n, e_m_n_host_converted, "Error: Incorrect results c", 1e-2, 1e-2); r0_device_buf.FromDevice(r0_m.mData.data()); r1_device_buf.FromDevice(r1_m.mData.data()); - pass &= ck::utils::check_err( - r0_m.mData, r0_m_host.mData, "Error: Incorrect results d0", 1e-2, 1e-2); - pass &= ck::utils::check_err( - r1_m.mData, r1_m_host.mData, "Error: Incorrect results d1", 1e-2, 1e-2); + pass &= ck::utils::check_err(r0_m, r0_m_host, "Error: Incorrect results d0", 1e-2, 1e-2); + pass &= ck::utils::check_err(r1_m, r1_m_host, "Error: Incorrect results d1", 1e-2, 1e-2); if(pass) { diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_reduce_xdl_common.hpp b/example/16_gemm_multi_d_multi_reduces/gemm_reduce_xdl_common.hpp index 8ba6342c8d..62992de597 100644 --- a/example/16_gemm_multi_d_multi_reduces/gemm_reduce_xdl_common.hpp +++ b/example/16_gemm_multi_d_multi_reduces/gemm_reduce_xdl_common.hpp @@ -241,8 +241,8 @@ auto run_gemm_reduce_max_xdl(ck::index_t M, if constexpr(std::is_same_v) { Tensor e_m_n_device_converted(e_m_n); - pass = ck::utils::check_err(e_m_n_device_converted.mData, - e_m_n_host_converted.mData, + pass = ck::utils::check_err(e_m_n_device_converted, + e_m_n_host_converted, "Error: Incorrect results c", 1e-2, 1e-2); @@ -251,12 +251,11 @@ auto run_gemm_reduce_max_xdl(ck::index_t M, #endif // CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 { pass = ck::utils::check_err( - e_m_n.mData, e_m_n_host_converted.mData, "Error: Incorrect results c", 1e-2, 1e-2); + e_m_n, e_m_n_host_converted, "Error: Incorrect results c", 1e-2, 1e-2); } r0_device_buf.FromDevice(r0_m.mData.data()); - pass &= ck::utils::check_err( - r0_m.mData, r0_m_host.mData, "Error: Incorrect results d0", 1e-2, 1e-2); + pass &= ck::utils::check_err(r0_m, r0_m_host, "Error: Incorrect results d0", 1e-2, 1e-2); if(pass) { @@ -456,8 +455,8 @@ bool run_gemm_reduce_mean_meansquare_xdl(ck::index_t M, if constexpr(std::is_same_v) { Tensor e_m_n_device_converted(e_m_n); - pass = ck::utils::check_err(e_m_n_device_converted.mData, - e_m_n_host_converted.mData, + pass = ck::utils::check_err(e_m_n_device_converted, + e_m_n_host_converted, "Error: Incorrect results c", 1e-2, 1e-2); @@ -466,16 +465,14 @@ bool run_gemm_reduce_mean_meansquare_xdl(ck::index_t M, #endif // CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 { pass = ck::utils::check_err( - e_m_n.mData, e_m_n_host_converted.mData, "Error: Incorrect results c", 1e-2, 1e-2); + e_m_n, e_m_n_host_converted, "Error: Incorrect results c", 1e-2, 1e-2); } r0_device_buf.FromDevice(r0_m.mData.data()); r1_device_buf.FromDevice(r1_m.mData.data()); - pass &= ck::utils::check_err( - r0_m.mData, r0_m_host.mData, "Error: Incorrect results d0", 1e-2, 1e-2); - pass &= ck::utils::check_err( - r1_m.mData, r1_m_host.mData, "Error: Incorrect results d1", 1e-2, 1e-2); + pass &= ck::utils::check_err(r0_m, r0_m_host, "Error: Incorrect results d0", 1e-2, 1e-2); + pass &= ck::utils::check_err(r1_m, r1_m_host, "Error: Incorrect results d1", 1e-2, 1e-2); if(pass) { diff --git a/example/17_convnd_bwd_data/convnd_bwd_data_common.hpp b/example/17_convnd_bwd_data/convnd_bwd_data_common.hpp index 061c6e9eb1..1e2c1832e7 100644 --- a/example/17_convnd_bwd_data/convnd_bwd_data_common.hpp +++ b/example/17_convnd_bwd_data/convnd_bwd_data_common.hpp @@ -142,7 +142,7 @@ int run_conv_bwd_data(bool do_verification, in_device_buf.FromDevice(in_device.mData.data()); - return ck::utils::check_err(in_device.mData, in_host.mData) ? 0 : 1; + return ck::utils::check_err(in_device, in_host) ? 0 : 1; } return 0; diff --git a/example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp b/example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp index 3488a53363..c2e3602a7b 100644 --- a/example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp +++ b/example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp @@ -16,6 +16,7 @@ #include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp" template @@ -132,15 +133,15 @@ int main(int argc, char* argv[]) std::size_t col, std::size_t stride, auto layout) { + using namespace ck::literals; + if(std::is_same::value) { - return HostTensorDescriptor(std::vector({batch_count, row, col}), - std::vector({row * stride, stride, 1})); + return HostTensorDescriptor({batch_count, row, col}, {row * stride, stride, 1_uz}); } else { - return HostTensorDescriptor(std::vector({batch_count, row, col}), - std::vector({col * stride, 1, stride})); + return HostTensorDescriptor({batch_count, row, col}, {col * stride, 1_uz, stride}); } }; @@ -149,17 +150,13 @@ int main(int argc, char* argv[]) Tensor c_g_m_n_host_result( f_host_tensor_descriptor(BatchCount, M, N, StrideC, CLayout{})); - Tensor d0_g_m_host_result(HostTensorDescriptor(std::vector( - {static_cast(BatchCount), static_cast(M)}))); - Tensor d1_g_m_host_result(HostTensorDescriptor(std::vector( - {static_cast(BatchCount), static_cast(M)}))); + Tensor d0_g_m_host_result({BatchCount, M}); + Tensor d1_g_m_host_result({BatchCount, M}); Tensor c_g_m_n_device_result( f_host_tensor_descriptor(BatchCount, M, N, StrideC, CLayout{})); - Tensor d0_g_m_device_result(HostTensorDescriptor(std::vector( - {static_cast(BatchCount), static_cast(M)}))); - Tensor d1_g_m_device_result(HostTensorDescriptor(std::vector( - {static_cast(BatchCount), static_cast(M)}))); + Tensor d0_g_m_device_result({BatchCount, M}); + Tensor d1_g_m_device_result({BatchCount, M}); std::cout << "a_g_m_k: " << a_g_m_k.mDesc << std::endl; std::cout << "b_g_k_n: " << b_g_k_n.mDesc << std::endl; @@ -296,16 +293,15 @@ int main(int argc, char* argv[]) } } - pass = ck::utils::check_err(c_g_m_n_host_result.mData, - c_g_m_n_device_result.mData, - "Error: Incorrect results c") && - ck::utils::check_err(d0_g_m_device_result.mData, - d0_g_m_host_result.mData, + pass = ck::utils::check_err( + c_g_m_n_host_result, c_g_m_n_device_result, "Error: Incorrect results c") && + ck::utils::check_err(d0_g_m_device_result, + d0_g_m_host_result, "Error: Incorrect results! D0", 1e-4, 1e-5) && - ck::utils::check_err(d1_g_m_device_result.mData, - d1_g_m_host_result.mData, + ck::utils::check_err(d1_g_m_device_result, + d1_g_m_host_result, "Error: Incorrect results! D1", 1e-3, 1e-5); diff --git a/example/19_binary_elementwise/broadcast_add_2d_amn_bn.cpp b/example/19_binary_elementwise/broadcast_add_2d_amn_bn.cpp index b84d320170..9eae27ca6e 100644 --- a/example/19_binary_elementwise/broadcast_add_2d_amn_bn.cpp +++ b/example/19_binary_elementwise/broadcast_add_2d_amn_bn.cpp @@ -12,6 +12,7 @@ #include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" using F16 = ck::half_t; using F32 = float; @@ -71,13 +72,13 @@ int main() ck::index_t Stride = 1024; auto f_host_tensor_descriptor1d = [](std::size_t len, std::size_t stride) { - return HostTensorDescriptor(std::vector({len}), - std::vector({stride})); + return HostTensorDescriptor({len}, {stride}); }; auto f_host_tensor_descriptor2d = [](std::size_t row, std::size_t col, std::size_t stride) { - return HostTensorDescriptor(std::vector({row, col}), - std::vector({stride, 1})); + using namespace ck::literals; + + return HostTensorDescriptor({row, col}, {stride, 1_uz}); }; Tensor a_m_n(f_host_tensor_descriptor2d(M, N, Stride)); @@ -128,8 +129,7 @@ int main() host_broadcast2D, Tensor, Tensor, Add, 0>( host_c_m_n, a_m_n, b_n, M, N, Add{}); - pass &= ck::utils::check_err( - c_m_n.mData, host_c_m_n.mData, "Error: Incorrect results c", 1e-3, 1e-3); + pass &= ck::utils::check_err(c_m_n, host_c_m_n, "Error: Incorrect results c", 1e-3, 1e-3); } return pass ? 0 : 1; diff --git a/example/19_binary_elementwise/broadcast_add_3d_am_bmnk.cpp b/example/19_binary_elementwise/broadcast_add_3d_am_bmnk.cpp index 041871bf57..813d38b01e 100644 --- a/example/19_binary_elementwise/broadcast_add_3d_am_bmnk.cpp +++ b/example/19_binary_elementwise/broadcast_add_3d_am_bmnk.cpp @@ -8,6 +8,7 @@ #include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp" #include "ck/tensor_operation/gpu/device/impl/device_elementwise.hpp" +#include "ck/library/utility/algorithm.hpp" #include "ck/library/utility/check_err.hpp" #include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/host_tensor.hpp" @@ -82,11 +83,9 @@ int main() std::array b_strides; std::array c_strides; - std::copy(mnk.begin(), mnk.end(), abc_lengths.begin()); - std::copy( - b_m_n_k.mDesc.GetStrides().begin(), b_m_n_k.mDesc.GetStrides().end(), b_strides.begin()); - std::copy( - c_m_n_k.mDesc.GetStrides().begin(), c_m_n_k.mDesc.GetStrides().end(), c_strides.begin()); + ck::ranges::copy(mnk, abc_lengths.begin()); + ck::ranges::copy(b_m_n_k.mDesc.GetStrides(), b_strides.begin()); + ck::ranges::copy(c_m_n_k.mDesc.GetStrides(), c_strides.begin()); auto broadcastAdd = DeviceElementwiseAddInstance{}; auto argument = broadcastAdd.MakeArgumentPointer( @@ -113,8 +112,8 @@ int main() host_broadcast3D_am_bmnk, Tensor, Tensor, Add>( host_c_m_n_k, a_m, b_m_n_k, mnk, Add{}); - pass &= ck::utils::check_err( - c_m_n_k.mData, host_c_m_n_k.mData, "Error: Incorrect results c", 1e-3, 1e-3); + pass &= + ck::utils::check_err(c_m_n_k, host_c_m_n_k, "Error: Incorrect results c", 1e-3, 1e-3); } return pass ? 0 : 1; diff --git a/example/19_binary_elementwise/elementwise_add_1d.cpp b/example/19_binary_elementwise/elementwise_add_1d.cpp index fb218d235f..a1ca9378d3 100644 --- a/example/19_binary_elementwise/elementwise_add_1d.cpp +++ b/example/19_binary_elementwise/elementwise_add_1d.cpp @@ -53,8 +53,7 @@ int main() ck::index_t M = 1024; auto f_host_tensor_descriptor1d = [](std::size_t len, std::size_t stride) { - return HostTensorDescriptor(std::vector({len}), - std::vector({stride})); + return HostTensorDescriptor({len}, {stride}); }; Tensor a_m(f_host_tensor_descriptor1d(M, 1)); @@ -105,8 +104,7 @@ int main() host_elementwise1D, Tensor, Tensor, Add>( host_c_m, a_m, b_m, M, Add{}); - pass &= ck::utils::check_err( - c_m.mData, host_c_m.mData, "Error: Incorrect results c", 1e-3, 1e-3); + pass &= ck::utils::check_err(c_m, host_c_m, "Error: Incorrect results c", 1e-3, 1e-3); } return pass ? 0 : 1; diff --git a/example/19_binary_elementwise/elementwise_add_4d.cpp b/example/19_binary_elementwise/elementwise_add_4d.cpp index d4b9f90fa4..27e1001481 100644 --- a/example/19_binary_elementwise/elementwise_add_4d.cpp +++ b/example/19_binary_elementwise/elementwise_add_4d.cpp @@ -8,6 +8,7 @@ #include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp" #include "ck/tensor_operation/gpu/device/impl/device_elementwise.hpp" +#include "ck/library/utility/algorithm.hpp" #include "ck/library/utility/check_err.hpp" #include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/host_tensor.hpp" @@ -82,10 +83,10 @@ int main() std::array b_strides; std::array c_strides; - std::copy(nchw.begin(), nchw.end(), abc_lengths.begin()); - std::copy(a.mDesc.GetStrides().begin(), a.mDesc.GetStrides().end(), a_strides.begin()); - std::copy(b.mDesc.GetStrides().begin(), b.mDesc.GetStrides().end(), b_strides.begin()); - std::copy(c.mDesc.GetStrides().begin(), c.mDesc.GetStrides().end(), c_strides.begin()); + ck::ranges::copy(nchw, abc_lengths.begin()); + ck::ranges::copy(a.mDesc.GetStrides(), a_strides.begin()); + ck::ranges::copy(b.mDesc.GetStrides(), b_strides.begin()); + ck::ranges::copy(c.mDesc.GetStrides(), c_strides.begin()); auto broadcastAdd = DeviceElementwiseAddInstance{}; auto argument = broadcastAdd.MakeArgumentPointer( @@ -112,8 +113,7 @@ int main() host_elementwise4D, Tensor, Tensor, Add>( host_c, a, b, nchw, Add{}); - pass &= - ck::utils::check_err(c.mData, host_c.mData, "Error: Incorrect results c", 1e-3, 1e-3); + pass &= ck::utils::check_err(c, host_c, "Error: Incorrect results c", 1e-3, 1e-3); } return pass ? 0 : 1; diff --git a/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_fp16.cpp b/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_fp16.cpp index 8d9f87d7e5..e37555e761 100644 --- a/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_fp16.cpp +++ b/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_fp16.cpp @@ -16,6 +16,7 @@ #include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" #include "ck/library/utility/check_err.hpp" @@ -108,21 +109,20 @@ using DeviceNormalizeInstance = ck::tensor_operation::device::DeviceElementwise< ck::Sequence<8>>; // scalarPerVector: y(layerNorm_out) auto f_host_tensor_descriptor1d = [](std::size_t len, std::size_t stride) { - return HostTensorDescriptor(std::vector({len}), - std::vector({stride})); + return HostTensorDescriptor({len}, {stride}); }; auto f_host_tensor_descriptor2d = [](std::size_t row, std::size_t col, std::size_t stride, auto layout) { + using namespace ck::literals; + if(std::is_same::value) { - return HostTensorDescriptor(std::vector({row, col}), - std::vector({stride, 1})); + return HostTensorDescriptor({row, col}, {stride, 1_uz}); } else { - return HostTensorDescriptor(std::vector({row, col}), - std::vector({1, stride})); + return HostTensorDescriptor({row, col}, {1_uz, stride}); } }; @@ -372,8 +372,8 @@ int main() N); layerNorm_device_buf.FromDevice(layerNorm_m_n.mData.data()); - pass &= ck::utils::check_err(layerNorm_m_n.mData, - host_layerNorm_m_n.mData, + pass &= ck::utils::check_err(layerNorm_m_n, + host_layerNorm_m_n, "Error: Incorrect results layerNorm_m_n", 1e-2, 1e-2); diff --git a/example/21_gemm_layernorm/gemm_layernorm_xdl_fp16.cpp b/example/21_gemm_layernorm/gemm_layernorm_xdl_fp16.cpp index 31231bc8ad..282c8763eb 100644 --- a/example/21_gemm_layernorm/gemm_layernorm_xdl_fp16.cpp +++ b/example/21_gemm_layernorm/gemm_layernorm_xdl_fp16.cpp @@ -16,6 +16,7 @@ #include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" #include "ck/library/utility/check_err.hpp" @@ -107,21 +108,20 @@ using DeviceNormalizeInstance = ck::tensor_operation::device::DeviceElementwise< ck::Sequence<8>>; // scalarPerVector: y(layerNorm_out) auto f_host_tensor_descriptor1d = [](std::size_t len, std::size_t stride) { - return HostTensorDescriptor(std::vector({len}), - std::vector({stride})); + return HostTensorDescriptor({len}, {stride}); }; auto f_host_tensor_descriptor2d = [](std::size_t row, std::size_t col, std::size_t stride, auto layout) { + using namespace ck::literals; + if(std::is_same::value) { - return HostTensorDescriptor(std::vector({row, col}), - std::vector({stride, 1})); + return HostTensorDescriptor({row, col}, {stride, 1_uz}); } else { - return HostTensorDescriptor(std::vector({row, col}), - std::vector({1, stride})); + return HostTensorDescriptor({row, col}, {1_uz, stride}); } }; @@ -346,11 +346,8 @@ int main() N); layerNorm_device_buf.FromDevice(layerNorm_m_n.mData.data()); - pass &= ck::utils::check_err(layerNorm_m_n.mData, - host_layerNorm_m_n.mData, - "Error: Incorrect results d1", - 1e-3, - 1e-3); + pass &= ck::utils::check_err( + layerNorm_m_n, host_layerNorm_m_n, "Error: Incorrect results d1", 1e-3, 1e-3); } { diff --git a/example/21_gemm_layernorm/gemm_xdl_layernorm_single_kernel_fp16.cpp b/example/21_gemm_layernorm/gemm_xdl_layernorm_single_kernel_fp16.cpp index 56d4472bc9..3c3e36be6a 100644 --- a/example/21_gemm_layernorm/gemm_xdl_layernorm_single_kernel_fp16.cpp +++ b/example/21_gemm_layernorm/gemm_xdl_layernorm_single_kernel_fp16.cpp @@ -10,6 +10,7 @@ #include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_layernorm_cshuffle.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" @@ -132,15 +133,15 @@ int main(int argc, char* argv[]) auto f_host_tensor_descriptor = [](std::size_t row, std::size_t col, std::size_t stride, auto layout) { + using namespace ck::literals; + if(std::is_same::value) { - return HostTensorDescriptor(std::vector({row, col}), - std::vector({stride, 1})); + return HostTensorDescriptor({row, col}, {stride, 1_uz}); } else { - return HostTensorDescriptor(std::vector({row, col}), - std::vector({1, stride})); + return HostTensorDescriptor({row, col}, {1_uz, stride}); } }; @@ -149,10 +150,10 @@ int main(int argc, char* argv[]) Tensor c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); Tensor c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); Tensor acc_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); - Tensor c0_n_bias(HostTensorDescriptor(std::vector({size_t(N)}))); + Tensor c0_n_bias({N}); Tensor c0_m_n_add(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); - Tensor c0_n_gamma(HostTensorDescriptor(std::vector({size_t(N)}))); - Tensor c0_n_beta(HostTensorDescriptor(std::vector({size_t(N)}))); + Tensor c0_n_gamma({N}); + Tensor c0_n_beta({N}); std::cout << "a_m_k: " << a_m_k.mDesc << std::endl; std::cout << "b_k_n: " << b_k_n.mDesc << std::endl; @@ -274,15 +275,12 @@ int main(int argc, char* argv[]) if constexpr(std::is_same::value) { pass &= ck::utils::check_err( - c_m_n_device_result.mData, c_m_n_host_result.mData, "Error: Incorrect results c"); + c_m_n_device_result, c_m_n_host_result, "Error: Incorrect results c"); } else if constexpr(std::is_same::value) { - pass &= ck::utils::check_err(c_m_n_device_result.mData, - c_m_n_host_result.mData, - "Error: Incorrect results c", - 1e-2, - 1e-2); + pass &= ck::utils::check_err( + c_m_n_device_result, c_m_n_host_result, "Error: Incorrect results c", 1e-2, 1e-2); } } return pass ? 0 : 1; diff --git a/example/22_cgemm/cgemm_xdl_common.hpp b/example/22_cgemm/cgemm_xdl_common.hpp index f420ac24d5..6aa06b7c32 100644 --- a/example/22_cgemm/cgemm_xdl_common.hpp +++ b/example/22_cgemm/cgemm_xdl_common.hpp @@ -11,6 +11,7 @@ #include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" template @@ -62,15 +63,15 @@ bool run_cgemm_xdl(ck::index_t M, auto f_host_tensor_descriptor = [](std::size_t row, std::size_t col, std::size_t stride, auto layout) { + using namespace ck::literals; + if(std::is_same::value) { - return HostTensorDescriptor(std::vector({row, col}), - std::vector({stride, 1})); + return HostTensorDescriptor({row, col}, {stride, 1_uz}); } else { - return HostTensorDescriptor(std::vector({row, col}), - std::vector({1, stride})); + return HostTensorDescriptor({row, col}, {1_uz, stride}); } }; @@ -219,14 +220,14 @@ bool run_cgemm_xdl(ck::index_t M, const Tensor c_m_n_real_device_result_converted(c_m_n_real_device_result); const Tensor c_m_n_imag_device_result_converted(c_m_n_imag_device_result); - result = ck::utils::check_err(c_m_n_real_device_result_converted.mData, - c_m_n_real_host_result.mData, + result = ck::utils::check_err(c_m_n_real_device_result_converted, + c_m_n_real_host_result, "Verification error: incorrect results in real part!", 1e-2f, 1e-1f); result = result && ck::utils::check_err( - c_m_n_imag_device_result_converted.mData, - c_m_n_imag_host_result.mData, + c_m_n_imag_device_result_converted, + c_m_n_imag_host_result, "Verification error: incorrect results in imaginary part!", 1e-2f, 1e-1f); @@ -234,14 +235,14 @@ bool run_cgemm_xdl(ck::index_t M, else #endif // CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 { - result = ck::utils::check_err(c_m_n_real_device_result.mData, - c_m_n_real_host_result.mData, + result = ck::utils::check_err(c_m_n_real_device_result, + c_m_n_real_host_result, "Verification error: incorrect results in real part!", 1e-2f, 1e-1f); result = result && ck::utils::check_err( - c_m_n_imag_device_result.mData, - c_m_n_imag_host_result.mData, + c_m_n_imag_device_result, + c_m_n_imag_host_result, "Verification error: incorrect results in imaginary part!", 1e-2f, 1e-1f); diff --git a/example/23_softmax/softmax_blockwise.cpp b/example/23_softmax/softmax_blockwise.cpp index 7ab9221fff..8854bf047b 100644 --- a/example/23_softmax/softmax_blockwise.cpp +++ b/example/23_softmax/softmax_blockwise.cpp @@ -246,7 +246,7 @@ int main(int argc, char* argv[]) invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false}); out_dev.FromDevice(out.mData.data()); // LogRangeAsType(std::cout << "tensor out: " , out.mData, ",") << std::endl; - pass = pass && ck::utils::check_err(out.mData, out_ref.mData); + pass = pass && ck::utils::check_err(out, out_ref); }; float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, args.time_kernel}); diff --git a/example/24_batched_gemm/run_batched_gemm_example.inc b/example/24_batched_gemm/run_batched_gemm_example.inc index 20bef9f935..21934add31 100644 --- a/example/24_batched_gemm/run_batched_gemm_example.inc +++ b/example/24_batched_gemm/run_batched_gemm_example.inc @@ -55,15 +55,15 @@ bool run_batched_gemm(const ProblemSize& problem_size, const ExecutionConfig& co std::size_t stride, std::size_t batch_stride, auto layout) { + using namespace ck::literals; + if(std::is_same::value) { - return HostTensorDescriptor(std::vector({batch_count_, row, col}), - std::vector({batch_stride, stride, 1})); + return HostTensorDescriptor({batch_count_, row, col}, {batch_stride, stride, 1_uz}); } else { - return HostTensorDescriptor(std::vector({batch_count_, row, col}), - std::vector({batch_stride, 1, stride})); + return HostTensorDescriptor({batch_count_, row, col}, {batch_stride, 1_uz, stride}); } }; @@ -174,11 +174,11 @@ bool run_batched_gemm(const ProblemSize& problem_size, const ExecutionConfig& co #ifdef BUILD_INT4_EXAMPLE const Tensor e_device_result_converted(e_g_m_n_device_result); - pass &= ck::utils::check_err(e_device_result_converted.mData, e_g_m_n_host_result.mData); + pass &= ck::utils::check_err(e_device_result_converted, e_g_m_n_host_result); #else pass = ck::utils::check_err( - e_g_m_n_device_result.mData, e_g_m_n_host_result.mData, "Error: Incorrect results c"); + e_g_m_n_device_result, e_g_m_n_host_result, "Error: Incorrect results c"); #endif } diff --git a/example/25_gemm_bias_e_permute/gemm_bias_e_permute_g1m2n3k1_xdl_fp16.cpp b/example/25_gemm_bias_e_permute/gemm_bias_e_permute_g1m2n3k1_xdl_fp16.cpp index 9cd34bfc1d..02eba871c7 100644 --- a/example/25_gemm_bias_e_permute/gemm_bias_e_permute_g1m2n3k1_xdl_fp16.cpp +++ b/example/25_gemm_bias_e_permute/gemm_bias_e_permute_g1m2n3k1_xdl_fp16.cpp @@ -246,21 +246,11 @@ int main(int argc, char* argv[]) exit(0); } - Tensor a_gs_ms_ks( - std::vector(a_gs_ms_ks_lengths.begin(), a_gs_ms_ks_lengths.end()), - std::vector(a_gs_ms_ks_strides.begin(), a_gs_ms_ks_strides.end())); - Tensor b_gs_ns_ks( - std::vector(b_gs_ns_ks_lengths.begin(), b_gs_ns_ks_lengths.end()), - std::vector(b_gs_ns_ks_strides.begin(), b_gs_ns_ks_strides.end())); - Tensor d_gs_ms_ns( - std::vector(d_gs_ms_ns_lengths.begin(), d_gs_ms_ns_lengths.end()), - std::vector(d_gs_ms_ns_strides.begin(), d_gs_ms_ns_strides.end())); - Tensor e_gs_ms_ns_host_result( - std::vector(e_gs_ms_ns_lengths.begin(), e_gs_ms_ns_lengths.end()), - std::vector(e_gs_ms_ns_strides.begin(), e_gs_ms_ns_strides.end())); - Tensor e_gs_ms_ns_device_result( - std::vector(e_gs_ms_ns_lengths.begin(), e_gs_ms_ns_lengths.end()), - std::vector(e_gs_ms_ns_strides.begin(), e_gs_ms_ns_strides.end())); + Tensor a_gs_ms_ks(a_gs_ms_ks_lengths, a_gs_ms_ks_strides); + Tensor b_gs_ns_ks(b_gs_ns_ks_lengths, b_gs_ns_ks_strides); + Tensor d_gs_ms_ns(d_gs_ms_ns_lengths, d_gs_ms_ns_strides); + Tensor e_gs_ms_ns_host_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides); + Tensor e_gs_ms_ns_device_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides); std::cout << "a_gs_ms_ks: " << a_gs_ms_ks.mDesc << std::endl; std::cout << "b_gs_ns_ks: " << b_gs_ns_ks.mDesc << std::endl; @@ -357,9 +347,7 @@ int main(int argc, char* argv[]) if(do_verification) { - Tensor c_gs_ms_ns_host_result( - std::vector(e_gs_ms_ns_lengths.begin(), e_gs_ms_ns_lengths.end()), - std::vector(e_gs_ms_ns_strides.begin(), e_gs_ms_ns_strides.end())); + Tensor c_gs_ms_ns_host_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides); using ReferenceOpInstance = ReferenceContraction_G1_M2_N3_K1 a_gs_ms_ks( - std::vector(a_gs_ms_ks_lengths.begin(), a_gs_ms_ks_lengths.end()), - std::vector(a_gs_ms_ks_strides.begin(), a_gs_ms_ks_strides.end())); - Tensor b_gs_ns_ks( - std::vector(b_gs_ns_ks_lengths.begin(), b_gs_ns_ks_lengths.end()), - std::vector(b_gs_ns_ks_strides.begin(), b_gs_ns_ks_strides.end())); - Tensor d_gs_ms_ns( - std::vector(d_gs_ms_ns_lengths.begin(), d_gs_ms_ns_lengths.end()), - std::vector(d_gs_ms_ns_strides.begin(), d_gs_ms_ns_strides.end())); - Tensor e_gs_ms_ns_host_result( - std::vector(e_gs_ms_ns_lengths.begin(), e_gs_ms_ns_lengths.end()), - std::vector(e_gs_ms_ns_strides.begin(), e_gs_ms_ns_strides.end())); - Tensor e_gs_ms_ns_device_result( - std::vector(e_gs_ms_ns_lengths.begin(), e_gs_ms_ns_lengths.end()), - std::vector(e_gs_ms_ns_strides.begin(), e_gs_ms_ns_strides.end())); + Tensor a_gs_ms_ks(a_gs_ms_ks_lengths, a_gs_ms_ks_strides); + Tensor b_gs_ns_ks(b_gs_ns_ks_lengths, b_gs_ns_ks_strides); + Tensor d_gs_ms_ns(d_gs_ms_ns_lengths, d_gs_ms_ns_strides); + Tensor e_gs_ms_ns_host_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides); + Tensor e_gs_ms_ns_device_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides); std::cout << "a_gs_ms_ks: " << a_gs_ms_ks.mDesc << std::endl; std::cout << "b_gs_ns_ks: " << b_gs_ns_ks.mDesc << std::endl; @@ -357,9 +347,7 @@ int main(int argc, char* argv[]) if(do_verification) { - Tensor c_gs_ms_ns_host_result( - std::vector(e_gs_ms_ns_lengths.begin(), e_gs_ms_ns_lengths.end()), - std::vector(e_gs_ms_ns_strides.begin(), e_gs_ms_ns_strides.end())); + Tensor c_gs_ms_ns_host_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides); using ReferenceOpInstance = ReferenceContraction_G1_M3_N2_K1 a_ms_ks( - std::vector(a_ms_ks_lengths.begin(), a_ms_ks_lengths.end()), - std::vector(a_ms_ks_strides.begin(), a_ms_ks_strides.end())); - Tensor b_ns_ks( - std::vector(b_ns_ks_lengths.begin(), b_ns_ks_lengths.end()), - std::vector(b_ns_ks_strides.begin(), b_ns_ks_strides.end())); - Tensor d_ms_ns( - std::vector(d_ms_ns_lengths.begin(), d_ms_ns_lengths.end()), - std::vector(d_ms_ns_strides.begin(), d_ms_ns_strides.end())); - Tensor e_ms_ns_host_result( - std::vector(e_ms_ns_lengths.begin(), e_ms_ns_lengths.end()), - std::vector(e_ms_ns_strides.begin(), e_ms_ns_strides.end())); - Tensor e_ms_ns_device_result( - std::vector(e_ms_ns_lengths.begin(), e_ms_ns_lengths.end()), - std::vector(e_ms_ns_strides.begin(), e_ms_ns_strides.end())); + Tensor a_ms_ks(a_ms_ks_lengths, a_ms_ks_strides); + Tensor b_ns_ks(b_ns_ks_lengths, b_ns_ks_strides); + Tensor d_ms_ns(d_ms_ns_lengths, d_ms_ns_strides); + Tensor e_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides); + Tensor e_ms_ns_device_result(e_ms_ns_lengths, e_ms_ns_strides); std::cout << "a_ms_ks: " << a_ms_ks.mDesc << std::endl; std::cout << "b_ns_ks: " << b_ns_ks.mDesc << std::endl; @@ -398,9 +388,7 @@ int main(int argc, char* argv[]) if(do_verification) { - Tensor c_ms_ns_host_result( - std::vector(e_ms_ns_lengths.begin(), e_ms_ns_lengths.end()), - std::vector(e_ms_ns_strides.begin(), e_ms_ns_strides.end())); + Tensor c_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides); using ReferenceOpInstance = ReferenceContraction_M2_N2_K2 a_ms_ks( - std::vector(a_ms_ks_lengths.begin(), a_ms_ks_lengths.end()), - std::vector(a_ms_ks_strides.begin(), a_ms_ks_strides.end())); - Tensor b_ns_ks( - std::vector(b_ns_ks_lengths.begin(), b_ns_ks_lengths.end()), - std::vector(b_ns_ks_strides.begin(), b_ns_ks_strides.end())); - Tensor e_ms_ns_host_result( - std::vector(e_ms_ns_lengths.begin(), e_ms_ns_lengths.end()), - std::vector(e_ms_ns_strides.begin(), e_ms_ns_strides.end())); - Tensor e_ms_ns_device_result( - std::vector(e_ms_ns_lengths.begin(), e_ms_ns_lengths.end()), - std::vector(e_ms_ns_strides.begin(), e_ms_ns_strides.end())); + Tensor a_ms_ks(a_ms_ks_lengths, a_ms_ks_strides); + Tensor b_ns_ks(b_ns_ks_lengths, b_ns_ks_strides); + Tensor e_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides); + Tensor e_ms_ns_device_result(e_ms_ns_lengths, e_ms_ns_strides); std::cout << "a_ms_ks: " << a_ms_ks.mDesc << std::endl; std::cout << "b_ns_ks: " << b_ns_ks.mDesc << std::endl; @@ -379,9 +371,7 @@ int main(int argc, char* argv[]) if(do_verification) { - Tensor c_ms_ns_host_result( - std::vector(e_ms_ns_lengths.begin(), e_ms_ns_lengths.end()), - std::vector(e_ms_ns_strides.begin(), e_ms_ns_strides.end())); + Tensor c_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides); using ReferenceOpInstance = ReferenceContraction_M2_N2_K2({len}), - std::vector({stride})); + return HostTensorDescriptor({len}, {stride}); }; auto f_host_tensor_descriptor2d = [](std::size_t row, std::size_t col, std::size_t stride) { - return HostTensorDescriptor(std::vector({row, col}), - std::vector({stride, 1})); + using namespace ck::literals; + + return HostTensorDescriptor({row, col}, {stride, 1_uz}); }; Tensor x(f_host_tensor_descriptor2d(M, N, Stride)); @@ -132,8 +133,7 @@ int main() ref_invoker.Run(ref_argument); y_dev.FromDevice(y.mData.data()); - pass &= - ck::utils::check_err(y.mData, host_y.mData, "Error: Incorrect results d1", 1e-3, 1e-3); + pass &= ck::utils::check_err(y, host_y, "Error: Incorrect results d1", 1e-3, 1e-3); } return (pass ? 0 : 1); } diff --git a/example/28_grouped_gemm_bias_e_permute/grouped_gemm_bias_e_permute_xdl_fp16.cpp b/example/28_grouped_gemm_bias_e_permute/grouped_gemm_bias_e_permute_xdl_fp16.cpp index e1fa966a22..32a714824c 100644 --- a/example/28_grouped_gemm_bias_e_permute/grouped_gemm_bias_e_permute_xdl_fp16.cpp +++ b/example/28_grouped_gemm_bias_e_permute/grouped_gemm_bias_e_permute_xdl_fp16.cpp @@ -297,18 +297,10 @@ int main(int argc, char* argv[]) const auto e_ms_ns_lengths = contraction_descs[i].e_ms_ns_lengths; const auto e_ms_ns_strides = contraction_descs[i].e_ms_ns_strides; - Tensor a_ms_ks( - std::vector(a_ms_ks_lengths.begin(), a_ms_ks_lengths.end()), - std::vector(a_ms_ks_strides.begin(), a_ms_ks_strides.end())); - Tensor b_ns_ks( - std::vector(b_ns_ks_lengths.begin(), b_ns_ks_lengths.end()), - std::vector(b_ns_ks_strides.begin(), b_ns_ks_strides.end())); - Tensor d_ms_ns( - std::vector(d_ms_ns_lengths.begin(), d_ms_ns_lengths.end()), - std::vector(d_ms_ns_strides.begin(), d_ms_ns_strides.end())); - Tensor e_ms_ns_device_result( - std::vector(e_ms_ns_lengths.begin(), e_ms_ns_lengths.end()), - std::vector(e_ms_ns_strides.begin(), e_ms_ns_strides.end())); + Tensor a_ms_ks(a_ms_ks_lengths, a_ms_ks_strides); + Tensor b_ns_ks(b_ns_ks_lengths, b_ns_ks_strides); + Tensor d_ms_ns(d_ms_ns_lengths, d_ms_ns_strides); + Tensor e_ms_ns_device_result(e_ms_ns_lengths, e_ms_ns_strides); ck::index_t M_ = std::accumulate(e_ms_ns_lengths.begin(), e_ms_ns_lengths.begin() + NumDimM, @@ -423,13 +415,9 @@ int main(int argc, char* argv[]) const auto e_ms_ns_lengths = contraction_descs[i].e_ms_ns_lengths; const auto e_ms_ns_strides = contraction_descs[i].e_ms_ns_strides; - Tensor c_ms_ns_host_result( - std::vector(e_ms_ns_lengths.begin(), e_ms_ns_lengths.end()), - std::vector(e_ms_ns_strides.begin(), e_ms_ns_strides.end())); + Tensor c_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides); - Tensor e_ms_ns_host_result( - std::vector(e_ms_ns_lengths.begin(), e_ms_ns_lengths.end()), - std::vector(e_ms_ns_strides.begin(), e_ms_ns_strides.end())); + Tensor e_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides); e_tensors_device[i]->FromDevice(e_device_tensors[i].mData.data()); @@ -475,7 +463,7 @@ int main(int argc, char* argv[]) } } - pass &= ck::utils::check_err(e_device_tensors[i].mData, e_ms_ns_host_result.mData); + pass &= ck::utils::check_err(e_device_tensors[i], e_ms_ns_host_result); } } diff --git a/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_xdl_fp16.cpp b/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_xdl_fp16.cpp index ef7f5b029b..b94fe8fd25 100644 --- a/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_xdl_fp16.cpp +++ b/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_xdl_fp16.cpp @@ -246,21 +246,11 @@ int main(int argc, char* argv[]) exit(0); } - Tensor a_gs_ms_ks( - std::vector(a_gs_ms_ks_lengths.begin(), a_gs_ms_ks_lengths.end()), - std::vector(a_gs_ms_ks_strides.begin(), a_gs_ms_ks_strides.end())); - Tensor b_gs_ns_ks( - std::vector(b_gs_ns_ks_lengths.begin(), b_gs_ns_ks_lengths.end()), - std::vector(b_gs_ns_ks_strides.begin(), b_gs_ns_ks_strides.end())); - Tensor d_gs_ms_ns( - std::vector(d_gs_ms_ns_lengths.begin(), d_gs_ms_ns_lengths.end()), - std::vector(d_gs_ms_ns_strides.begin(), d_gs_ms_ns_strides.end())); - Tensor e_gs_ms_ns_host_result( - std::vector(e_gs_ms_ns_lengths.begin(), e_gs_ms_ns_lengths.end()), - std::vector(e_gs_ms_ns_strides.begin(), e_gs_ms_ns_strides.end())); - Tensor e_gs_ms_ns_device_result( - std::vector(e_gs_ms_ns_lengths.begin(), e_gs_ms_ns_lengths.end()), - std::vector(e_gs_ms_ns_strides.begin(), e_gs_ms_ns_strides.end())); + Tensor a_gs_ms_ks(a_gs_ms_ks_lengths, a_gs_ms_ks_strides); + Tensor b_gs_ns_ks(b_gs_ns_ks_lengths, b_gs_ns_ks_strides); + Tensor d_gs_ms_ns(d_gs_ms_ns_lengths, d_gs_ms_ns_strides); + Tensor e_gs_ms_ns_host_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides); + Tensor e_gs_ms_ns_device_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides); std::cout << "a_gs_ms_ks: " << a_gs_ms_ks.mDesc << std::endl; std::cout << "b_gs_ns_ks: " << b_gs_ns_ks.mDesc << std::endl; @@ -362,9 +352,7 @@ int main(int argc, char* argv[]) if(do_verification) { - Tensor c_ms_ns_host_result( - std::vector(e_gs_ms_ns_lengths.begin(), e_gs_ms_ns_lengths.end()), - std::vector(e_gs_ms_ns_strides.begin(), e_gs_ms_ns_strides.end())); + Tensor c_ms_ns_host_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides); using ReferenceOpInstance = ReferenceContraction_G2_M2_N2_K1 input_left_pads{}; std::array input_right_pads{}; - auto copy = [](auto& x, auto& y) { std::copy(x.begin(), x.end(), y.begin()); }; + auto copy = [](const auto& x, auto& y) { ck::ranges::copy(x, y.begin()); }; copy(in_g_n_c_wis_desc.GetLengths(), a_g_n_c_wis_lengths); copy(in_g_n_c_wis_desc.GetStrides(), a_g_n_c_wis_strides); @@ -257,10 +257,10 @@ bool run_grouped_conv_fwd_bias_relu_add(const ExecutionConfig& config, const Tensor out_device_converted(out_device); return ck::utils::check_err( - out_device_converted.mData, out_host.mData, "Error: incorrect results!", 1e-5f, 1e-4f); + out_device_converted, out_host, "Error: incorrect results!", 1e-5f, 1e-4f); #else return ck::utils::check_err( - out_device.mData, out_host.mData, "Error: incorrect results!", 1e-5f, 1e-4f); + out_device, out_host, "Error: incorrect results!", 1e-5f, 1e-4f); #endif } diff --git a/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_bf16.cpp b/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_bf16.cpp index 3988950918..74e0e07e62 100644 --- a/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_bf16.cpp +++ b/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_bf16.cpp @@ -23,6 +23,7 @@ Gemm + Gemm fused operation. Computes C_m_o = A_m_k * B0_k_n * B1_n_o #include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp" template diff --git a/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_fp16.cpp b/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_fp16.cpp index 2f0d4e686c..d5fadb8081 100644 --- a/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_fp16.cpp +++ b/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_fp16.cpp @@ -23,6 +23,7 @@ Gemm + Gemm fused operation. Computes C_m_o = A_m_k * B0_k_n * B1_n_o #include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp" template diff --git a/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_fp32.cpp b/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_fp32.cpp index 6ad74889db..0dd4e0914f 100644 --- a/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_fp32.cpp +++ b/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_fp32.cpp @@ -23,6 +23,7 @@ Gemm + Gemm fused operation. Computes C_m_o = A_m_k * B0_k_n * B1_n_o #include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp" template diff --git a/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_int4.cpp b/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_int4.cpp index 29faf13e13..1fd93622a1 100644 --- a/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_int4.cpp +++ b/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_int4.cpp @@ -27,6 +27,7 @@ Gemm + Gemm fused operation. Computes C_m_o = A_m_k * B0_k_n * B1_n_o #include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp" template diff --git a/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_int8.cpp b/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_int8.cpp index 153257543f..15d98abab7 100644 --- a/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_int8.cpp +++ b/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_int8.cpp @@ -23,6 +23,7 @@ Gemm + Gemm fused operation. Computes C_m_o = A_m_k * B0_k_n * B1_n_o #include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp" template diff --git a/example/31_batched_gemm_gemm/run_batched_gemm_gemm_example.inc b/example/31_batched_gemm_gemm/run_batched_gemm_gemm_example.inc index 931d2205c9..7e5f1614bc 100644 --- a/example/31_batched_gemm_gemm/run_batched_gemm_gemm_example.inc +++ b/example/31_batched_gemm_gemm/run_batched_gemm_gemm_example.inc @@ -106,15 +106,15 @@ bool run_batched_gemm_gemm_example(int argc, char* argv[]) std::size_t stride, std::size_t batch_stride, auto layout) { + using namespace ck::literals; + if(std::is_same::value) { - return HostTensorDescriptor(std::vector({batch_count, row, col}), - std::vector({batch_stride, stride, 1})); + return HostTensorDescriptor({batch_count, row, col}, {batch_stride, stride, 1_uz}); } else { - return HostTensorDescriptor(std::vector({batch_count, row, col}), - std::vector({batch_stride, 1, stride})); + return HostTensorDescriptor({batch_count, row, col}, {batch_stride, 1_uz, stride}); } }; @@ -270,7 +270,7 @@ bool run_batched_gemm_gemm_example(int argc, char* argv[]) c_g_m_o_device_buf.FromDevice(c_g_m_o_device_result.mData.data()); #endif - return ck::utils::check_err(c_g_m_o_device_result.mData, c_g_m_o_host_result.mData); + return ck::utils::check_err(c_g_m_o_device_result, c_g_m_o_host_result); } return true; diff --git a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp index 644adf300e..0eb1565330 100644 --- a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp +++ b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp @@ -24,6 +24,7 @@ Gemm + Softmax + Gemm fused operation. Computes C_g_m_o = Softmax(A_g_m_k * B0_g #include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp" diff --git a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp index 3727be02d4..2ce91a8c60 100644 --- a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp +++ b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp @@ -24,6 +24,7 @@ Gemm + Softmax + Gemm fused operation. Computes C_g_m_o = Softmax(A_g_m_k * B0_g #include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp" diff --git a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_xdl_fp16.cpp b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_xdl_fp16.cpp index 327875e28b..182eca0617 100644 --- a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_xdl_fp16.cpp +++ b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_xdl_fp16.cpp @@ -23,6 +23,7 @@ Gemm + Softmax + Gemm fused operation. Computes C_g_m_o = Softmax(A_g_m_k * B0_g #include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp" @@ -245,15 +246,15 @@ int main(int argc, char* argv[]) std::size_t stride, std::size_t batch_stride, auto layout) { + using namespace ck::literals; + if(std::is_same::value) { - return HostTensorDescriptor(std::vector({batch_count, row, col}), - std::vector({batch_stride, stride, 1})); + return HostTensorDescriptor({batch_count, row, col}, {batch_stride, stride, 1_uz}); } else { - return HostTensorDescriptor(std::vector({batch_count, row, col}), - std::vector({batch_stride, 1, stride})); + return HostTensorDescriptor({batch_count, row, col}, {batch_stride, 1_uz, stride}); } }; @@ -391,7 +392,7 @@ int main(int argc, char* argv[]) ref_gemm1_invoker.Run(ref_gemm1_argument); - return ck::utils::check_err(c_g_m_o_device_result.mData, c_g_m_o_host_result.mData) ? 0 : 1; + return ck::utils::check_err(c_g_m_o_device_result, c_g_m_o_host_result) ? 0 : 1; } return 0; diff --git a/example/32_batched_gemm_scale_softmax_gemm/grouped_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp b/example/32_batched_gemm_scale_softmax_gemm/grouped_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp index 11d9927f70..38b5badc6e 100644 --- a/example/32_batched_gemm_scale_softmax_gemm/grouped_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp +++ b/example/32_batched_gemm_scale_softmax_gemm/grouped_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp @@ -24,6 +24,7 @@ Gemm + Softmax + Gemm fused operation. Computes C_g_m_o = Softmax(A_g_m_k * B0_g #include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp" diff --git a/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm_permute.inc b/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm_permute.inc index 5a373d7a27..cdfbd6a64f 100644 --- a/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm_permute.inc +++ b/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm_permute.inc @@ -22,7 +22,7 @@ int run(int argc, char* argv[]) float alpha = 1; - bool input_permute = false; + bool input_permute = false; bool output_permute = true; if(argc == 1) @@ -50,7 +50,7 @@ int run(int argc, char* argv[]) alpha = std::stof(argv[10]); - input_permute = std::stoi(argv[11]); + input_permute = std::stoi(argv[11]); output_permute = std::stoi(argv[12]); } else diff --git a/example/32_batched_gemm_scale_softmax_gemm/run_grouped_gemm_scale_softmax_gemm_permute.inc b/example/32_batched_gemm_scale_softmax_gemm/run_grouped_gemm_scale_softmax_gemm_permute.inc index 57782208de..ef2acf61f5 100644 --- a/example/32_batched_gemm_scale_softmax_gemm/run_grouped_gemm_scale_softmax_gemm_permute.inc +++ b/example/32_batched_gemm_scale_softmax_gemm/run_grouped_gemm_scale_softmax_gemm_permute.inc @@ -7,7 +7,7 @@ int run(int argc, char* argv[]) int init_method = 1; bool time_kernel = false; - bool input_permute = false; + bool input_permute = false; bool output_permute = true; if(argc == 1) @@ -26,7 +26,7 @@ int run(int argc, char* argv[]) init_method = std::stoi(argv[2]); time_kernel = std::stoi(argv[3]); - input_permute = std::stoi(argv[4]); + input_permute = std::stoi(argv[4]); output_permute = std::stoi(argv[5]); } else @@ -66,10 +66,10 @@ int run(int argc, char* argv[]) std::cout << "group count " << group_count << ". printing first 4 groups\n"; for(std::size_t i = 0; i < group_count; i++) { - int M = 128 * (rand() % 8 + 1); - int N = 128 * (rand() % 8 + 1); - int K = 40; - int O = 40 * (rand() % 2 + 1); + int M = 128 * (rand() % 8 + 1); + int N = 128 * (rand() % 8 + 1); + int K = 40; + int O = 40 * (rand() % 2 + 1); int G0 = rand() % 3 + 1; int G1 = rand() % 5 + 1; @@ -228,12 +228,12 @@ int run(int argc, char* argv[]) { for(std::size_t i = 0; i < group_count; i++) { - const int& G0 = g0_g1_m_n_k_o[i][0]; - const int& G1 = g0_g1_m_n_k_o[i][1]; - const int& M = g0_g1_m_n_k_o[i][2]; - const int& N = g0_g1_m_n_k_o[i][3]; - const int& K = g0_g1_m_n_k_o[i][4]; - const int& O = g0_g1_m_n_k_o[i][5]; + const int& G0 = g0_g1_m_n_k_o[i][0]; + const int& G1 = g0_g1_m_n_k_o[i][1]; + const int& M = g0_g1_m_n_k_o[i][2]; + const int& N = g0_g1_m_n_k_o[i][3]; + const int& K = g0_g1_m_n_k_o[i][4]; + const int& O = g0_g1_m_n_k_o[i][5]; const auto& c_gs_ms_os_lengths = problem_descs[i].c_gs_ms_os_lengths; const auto& c_gs_ms_os_strides = problem_descs[i].c_gs_ms_os_strides; diff --git a/example/33_multiple_reduce/dual_reduce_common.hpp b/example/33_multiple_reduce/dual_reduce_common.hpp index 9de98b71ce..376b95ea7b 100644 --- a/example/33_multiple_reduce/dual_reduce_common.hpp +++ b/example/33_multiple_reduce/dual_reduce_common.hpp @@ -12,6 +12,7 @@ #include "ck/utility/reduction_enums.hpp" #include "ck/utility/data_type.hpp" +#include "ck/library/utility/algorithm.hpp" #include "ck/library/utility/check_err.hpp" #include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/host_tensor.hpp" @@ -253,10 +254,10 @@ int mean_meansquare_dual_reduce_test(size_t n, std::array i_outLengths; std::array i_outStrides; - std::copy(inLengths.begin(), inLengths.end(), i_inLengths.begin()); - std::copy(inStrides.begin(), inStrides.end(), i_inStrides.begin()); - std::copy(outLengths.begin(), outLengths.end(), i_outLengths.begin()); - std::copy(outStrides.begin(), outStrides.end(), i_outStrides.begin()); + ck::ranges::copy(inLengths, i_inLengths.begin()); + ck::ranges::copy(inStrides, i_inStrides.begin()); + ck::ranges::copy(outLengths, i_outLengths.begin()); + ck::ranges::copy(outStrides, i_outStrides.begin()); auto dual_reduce_op = DeviceDualReduce{}; @@ -305,8 +306,8 @@ int mean_meansquare_dual_reduce_test(size_t n, { mean_dev.FromDevice(mean.mData.data()); meansquare_dev.FromDevice(meansquare.mData.data()); - pass = pass && ck::utils::check_err(mean.mData, mean_ref.mData); - pass = pass && ck::utils::check_err(meansquare.mData, meansquare_ref.mData); + pass = pass && ck::utils::check_err(mean, mean_ref); + pass = pass && ck::utils::check_err(meansquare, meansquare_ref); }; return (pass ? 0 : 1); diff --git a/example/34_batchnorm/batchnorm_forward_nhwc.cpp b/example/34_batchnorm/batchnorm_forward_nhwc.cpp index 13e408cab8..03f24eeb67 100644 --- a/example/34_batchnorm/batchnorm_forward_nhwc.cpp +++ b/example/34_batchnorm/batchnorm_forward_nhwc.cpp @@ -9,6 +9,7 @@ #include #include "ck/ck.hpp" +#include "ck/library/utility/algorithm.hpp" #include "ck/library/utility/check_err.hpp" #include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/host_tensor.hpp" @@ -263,14 +264,10 @@ bool bnorm_fwd_nhwc_test(bool do_verification, std::array i_scaleBiasMeanVarLengths; std::array i_scaleBiasMeanVarStrides; - std::copy(inOutLengths.begin(), inOutLengths.end(), i_inOutLengths.begin()); - std::copy(inOutStrides.begin(), inOutStrides.end(), i_inOutStrides.begin()); - std::copy(scaleBiasMeanVarLengths.begin(), - scaleBiasMeanVarLengths.end(), - i_scaleBiasMeanVarLengths.begin()); - std::copy(scaleBiasMeanVarStrides.begin(), - scaleBiasMeanVarStrides.end(), - i_scaleBiasMeanVarStrides.begin()); + ck::ranges::copy(inOutLengths, i_inOutLengths.begin()); + ck::ranges::copy(inOutStrides, i_inOutStrides.begin()); + ck::ranges::copy(scaleBiasMeanVarLengths, i_scaleBiasMeanVarLengths.begin()); + ck::ranges::copy(scaleBiasMeanVarStrides, i_scaleBiasMeanVarStrides.begin()); using PassThroughOp = ck::tensor_operation::element_wise::PassThrough; @@ -413,7 +410,7 @@ bool bnorm_fwd_nhwc_test(bool do_verification, (void)invoker_ptr_ref->Run(argument_ptr_ref.get()); y_dev.FromDevice(y.mData.data()); - pass = pass && ck::utils::check_err(y.mData, y_ref.mData); + pass = pass && ck::utils::check_err(y, y_ref); if(updateMovingAverage) { @@ -423,10 +420,8 @@ bool bnorm_fwd_nhwc_test(bool do_verification, resultRunningMean_dev.FromDevice(resultRunningMean.mData.data()); resultRunningVariance_dev.FromDevice(resultRunningVariance.mData.data()); - pass = - pass && ck::utils::check_err(resultRunningMean.mData, resultRunningMean_ref.mData); - pass = pass && ck::utils::check_err(resultRunningVariance.mData, - resultRunningVariance_ref.mData); + pass = pass && ck::utils::check_err(resultRunningMean, resultRunningMean_ref); + pass = pass && ck::utils::check_err(resultRunningVariance, resultRunningVariance_ref); }; if(saveMeanAndInvVariance) @@ -439,9 +434,8 @@ bool bnorm_fwd_nhwc_test(bool do_verification, resultSaveMean_dev.FromDevice(resultSaveMean.mData.data()); resultSaveInvVariance_dev.FromDevice(resultSaveInvVariance.mData.data()); - pass = pass && ck::utils::check_err(resultSaveMean.mData, resultSaveMean_ref.mData); - pass = pass && ck::utils::check_err(resultSaveInvVariance.mData, - resultSaveInvVariance_ref.mData); + pass = pass && ck::utils::check_err(resultSaveMean, resultSaveMean_ref); + pass = pass && ck::utils::check_err(resultSaveInvVariance, resultSaveInvVariance_ref); }; }; diff --git a/example/34_batchnorm/batchnorm_infer_nhwc.cpp b/example/34_batchnorm/batchnorm_infer_nhwc.cpp index d6c5dc1001..2dc9d6b789 100644 --- a/example/34_batchnorm/batchnorm_infer_nhwc.cpp +++ b/example/34_batchnorm/batchnorm_infer_nhwc.cpp @@ -9,6 +9,7 @@ #include #include "ck/ck.hpp" +#include "ck/library/utility/algorithm.hpp" #include "ck/library/utility/check_err.hpp" #include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/host_tensor.hpp" @@ -220,14 +221,10 @@ bool bnorm_infer_nhwc_test(bool do_verification, std::array i_scaleBiasMeanVarLengths; std::array i_scaleBiasMeanVarStrides; - std::copy(inOutLengths.begin(), inOutLengths.end(), i_inOutLengths.begin()); - std::copy(inOutStrides.begin(), inOutStrides.end(), i_inOutStrides.begin()); - std::copy(scaleBiasMeanVarLengths.begin(), - scaleBiasMeanVarLengths.end(), - i_scaleBiasMeanVarLengths.begin()); - std::copy(scaleBiasMeanVarStrides.begin(), - scaleBiasMeanVarStrides.end(), - i_scaleBiasMeanVarStrides.begin()); + ck::ranges::copy(inOutLengths, i_inOutLengths.begin()); + ck::ranges::copy(inOutStrides, i_inOutStrides.begin()); + ck::ranges::copy(scaleBiasMeanVarLengths, i_scaleBiasMeanVarLengths.begin()); + ck::ranges::copy(scaleBiasMeanVarStrides, i_scaleBiasMeanVarStrides.begin()); int result = 0; @@ -302,7 +299,7 @@ bool bnorm_infer_nhwc_test(bool do_verification, (void)invoker_ptr_ref->Run(argument_ptr_ref.get()); y_dev.FromDevice(y.mData.data()); - pass = pass && ck::utils::check_err(y.mData, y_ref.mData); + pass = pass && ck::utils::check_err(y, y_ref); }; return (pass); diff --git a/example/35_splitK_gemm/run_splitK_gemm_example.inc b/example/35_splitK_gemm/run_splitK_gemm_example.inc index c78cb36a9a..e9bd5c552d 100644 --- a/example/35_splitK_gemm/run_splitK_gemm_example.inc +++ b/example/35_splitK_gemm/run_splitK_gemm_example.inc @@ -34,15 +34,15 @@ bool run_splitK_gemm(const ProblemSize& problem_size, const ExecutionConfig& con auto f_host_tensor_descriptor = [](std::size_t row, std::size_t col, std::size_t stride, auto layout) { + using namespace ck::literals; + if(std::is_same::value) { - return HostTensorDescriptor(std::vector({row, col}), - std::vector({stride, 1})); + return HostTensorDescriptor({row, col}, {stride, 1_uz}); } else { - return HostTensorDescriptor(std::vector({row, col}), - std::vector({1, stride})); + return HostTensorDescriptor({row, col}, {1_uz, stride}); } }; @@ -146,15 +146,12 @@ bool run_splitK_gemm(const ProblemSize& problem_size, const ExecutionConfig& con if(std::is_same::value) { - pass &= ck::utils::check_err(c_m_n_device_result.mData, - c_m_n_host_result.mData, - "fp16 incorrect result", - 3e-3, - 1e-3); + pass &= ck::utils::check_err( + c_m_n_device_result, c_m_n_host_result, "fp16 incorrect result", 3e-3, 1e-3); } else { - pass &= ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData); + pass &= ck::utils::check_err(c_m_n_device_result, c_m_n_host_result); } } diff --git a/example/36_sparse_embedding/sparse_embedding3_forward_layernorm.cpp b/example/36_sparse_embedding/sparse_embedding3_forward_layernorm.cpp index 69d5c587e9..f5eb4c3b6b 100644 --- a/example/36_sparse_embedding/sparse_embedding3_forward_layernorm.cpp +++ b/example/36_sparse_embedding/sparse_embedding3_forward_layernorm.cpp @@ -86,12 +86,10 @@ int main() constexpr auto index_length = 2048; constexpr AccDataType epsilon = 1e-4; - auto f_host_tensor_desc_1d = [](std::size_t len_) { - return HostTensorDescriptor(std::vector({len_})); - }; + auto f_host_tensor_desc_1d = [](std::size_t len_) { return HostTensorDescriptor({len_}); }; auto f_host_tensor_desc_2d = [](std::size_t rows_, std::size_t cols_) { - return HostTensorDescriptor(std::vector({rows_, cols_})); + return HostTensorDescriptor({rows_, cols_}); }; using ReferenceInstance = @@ -203,8 +201,7 @@ int main() ref_invoker.Run(ref_argument); out_dev.FromDevice(out_from_dev.mData.data()); - pass &= ck::utils::check_err( - out_from_dev.mData, out.mData, "Error: Incorrect results", 1e-3, 1e-3); + pass &= ck::utils::check_err(out_from_dev, out, "Error: Incorrect results", 1e-3, 1e-3); } double total_read = current_dim * index_length * 3 * sizeof(EmbType) + diff --git a/example/37_batched_gemm_add_add_relu_gemm_add/batched_gemm_add_add_relu_gemm_add_xdl_fp16.cpp b/example/37_batched_gemm_add_add_relu_gemm_add/batched_gemm_add_add_relu_gemm_add_xdl_fp16.cpp index e7efa04d23..071e8a7431 100644 --- a/example/37_batched_gemm_add_add_relu_gemm_add/batched_gemm_add_add_relu_gemm_add_xdl_fp16.cpp +++ b/example/37_batched_gemm_add_add_relu_gemm_add/batched_gemm_add_add_relu_gemm_add_xdl_fp16.cpp @@ -19,6 +19,7 @@ Computes C_m_o = Relu(A0[m, k] * B0[n, k] + D00[m, n] + D01[mn]) * B1[n, o] + D1 #include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp" template @@ -314,15 +315,15 @@ int main(int argc, char* argv[]) std::size_t stride, std::size_t batch_stride, auto layout) { + using namespace ck::literals; + if(std::is_same::value) { - return HostTensorDescriptor(std::vector({batch_count, row, col}), - std::vector({batch_stride, stride, 1})); + return HostTensorDescriptor({batch_count, row, col}, {batch_stride, stride, 1_uz}); } else { - return HostTensorDescriptor(std::vector({batch_count, row, col}), - std::vector({batch_stride, 1, stride})); + return HostTensorDescriptor({batch_count, row, col}, {batch_stride, 1_uz, stride}); } }; @@ -511,8 +512,7 @@ int main(int argc, char* argv[]) cde1_element_op(e1_g_m_o_host_result(idx), c1_g_m_o(idx), d1_g_m_o(idx)); }); - return ck::utils::check_err(e1_g_m_o_device_result.mData, e1_g_m_o_host_result.mData) ? 0 - : 1; + return ck::utils::check_err(e1_g_m_o_device_result, e1_g_m_o_host_result) ? 0 : 1; } return 0; diff --git a/example/38_grouped_conv_bwd_data_multiple_d/run_grouped_conv_bwd_data_bias_relu_example.inc b/example/38_grouped_conv_bwd_data_multiple_d/run_grouped_conv_bwd_data_bias_relu_example.inc index 880a3252c3..0afd8bd70d 100644 --- a/example/38_grouped_conv_bwd_data_multiple_d/run_grouped_conv_bwd_data_bias_relu_example.inc +++ b/example/38_grouped_conv_bwd_data_multiple_d/run_grouped_conv_bwd_data_bias_relu_example.inc @@ -61,7 +61,7 @@ bool run_conv_bwd_data_bias_relu(const ExecutionConfig& config, std::array input_left_pads{}; std::array input_right_pads{}; - auto copy = [](auto& x, auto& y) { std::copy(x.begin(), x.end(), y.begin()); }; + auto copy = [](const auto& x, auto& y) { ck::ranges::copy(x, y.begin()); }; copy(out_g_n_k_wos_desc.GetLengths(), a_g_n_k_wos_lengths); copy(out_g_n_k_wos_desc.GetStrides(), a_g_n_k_wos_strides); @@ -157,7 +157,7 @@ bool run_conv_bwd_data_bias_relu(const ExecutionConfig& config, in_device_buf.FromDevice(in_device.mData.data()); - return ck::utils::check_err(in_device.mData, in_host.mData); + return ck::utils::check_err(in_device, in_host); } return true; diff --git a/example/39_permute/common.hpp b/example/39_permute/common.hpp index 1c26f3d9a6..ab612cea17 100644 --- a/example/39_permute/common.hpp +++ b/example/39_permute/common.hpp @@ -19,6 +19,7 @@ #include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp" #include "ck/utility/type.hpp" +#include "ck/library/utility/algorithm.hpp" #include "ck/library/utility/check_err.hpp" #include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/fill.hpp" @@ -247,19 +248,6 @@ inline auto to_array(Range& range) noexcept return detail::to_array_proxy>{range}; } -namespace ranges { -template -inline auto copy(InputRange&& range, OutputIterator iter) - -> decltype(std::copy(std::begin(std::forward(range)), - std::end(std::forward(range)), - iter)) -{ - return std::copy(std::begin(std::forward(range)), - std::end(std::forward(range)), - iter); -} -} // namespace ranges - template inline auto is_valid_axes(const Axes& axes) -> std::enable_if_t, bool> @@ -350,7 +338,7 @@ auto extend_shape(const Problem::Shape& shape, std::size_t new_dim) using std::begin, std::end; - std::copy(begin(shape), end(shape), begin(extended_shape)); + ck::ranges::copy(shape, begin(extended_shape)); extended_shape.back() = new_dim; return extended_shape; @@ -362,7 +350,7 @@ auto extend_axes(const Problem::Axes& axes) using std::begin, std::end; - std::copy(begin(axes), end(axes), begin(extended_axes)); + ck::ranges::copy(axes, begin(extended_axes)); extended_axes.back() = detail::get_array_size_v; return extended_axes; diff --git a/example/39_permute/run_permute_bundle_example.inc b/example/39_permute/run_permute_bundle_example.inc index ae23257022..70406d63f9 100644 --- a/example/39_permute/run_permute_bundle_example.inc +++ b/example/39_permute/run_permute_bundle_example.inc @@ -57,7 +57,7 @@ bool run_permute_bundle(const Problem& problem) using std::begin; Tensor input_tensor(input_shape); - ranges::copy(input_bundle_tensor.AsSpan(), begin(input_tensor)); + ck::ranges::copy(input_bundle_tensor.AsSpan(), begin(input_tensor)); Tensor output_tensor(transpose(input_shape, input_axes)); if(!host_permute(input_tensor, input_axes, PassThrough{}, output_tensor)) diff --git a/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_bf16.cpp b/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_bf16.cpp index 205916ff41..2aea08c400 100644 --- a/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_bf16.cpp +++ b/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_bf16.cpp @@ -11,6 +11,7 @@ #include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/library/utility/algorithm.hpp" #include "ck/library/utility/check_err.hpp" #include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/host_tensor.hpp" diff --git a/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_fp16.cpp b/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_fp16.cpp index 3bfa4c50e5..b7f80e76d6 100644 --- a/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_fp16.cpp +++ b/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_fp16.cpp @@ -11,6 +11,7 @@ #include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/library/utility/algorithm.hpp" #include "ck/library/utility/check_err.hpp" #include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/host_tensor.hpp" diff --git a/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_fp32.cpp b/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_fp32.cpp index ab0ddf075b..15e460948e 100644 --- a/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_fp32.cpp +++ b/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_fp32.cpp @@ -11,6 +11,7 @@ #include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/library/utility/algorithm.hpp" #include "ck/library/utility/check_err.hpp" #include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/host_tensor.hpp" diff --git a/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_int4.cpp b/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_int4.cpp index 7a46285c50..2cc4c07c0d 100644 --- a/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_int4.cpp +++ b/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_int4.cpp @@ -15,6 +15,7 @@ #include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/library/utility/algorithm.hpp" #include "ck/library/utility/check_err.hpp" #include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/host_tensor.hpp" diff --git a/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_int8.cpp b/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_int8.cpp index 62287ea60c..40ff0f69cc 100644 --- a/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_int8.cpp +++ b/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_int8.cpp @@ -11,6 +11,7 @@ #include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/library/utility/algorithm.hpp" #include "ck/library/utility/check_err.hpp" #include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/host_tensor.hpp" diff --git a/example/41_grouped_conv_conv_fwd/run_grouped_conv_conv_fwd_example.inc b/example/41_grouped_conv_conv_fwd/run_grouped_conv_conv_fwd_example.inc index f714ed98f4..104397928d 100644 --- a/example/41_grouped_conv_conv_fwd/run_grouped_conv_conv_fwd_example.inc +++ b/example/41_grouped_conv_conv_fwd/run_grouped_conv_conv_fwd_example.inc @@ -97,7 +97,7 @@ bool run_grouped_conv_conv_fwd(bool do_verification, std::array input1_left_pads{}; std::array input1_right_pads{}; - auto copy = [](auto& x, auto& y) { std::copy(x.begin(), x.end(), y.begin()); }; + auto copy = [](const auto& x, auto& y) { ck::ranges::copy(x, y.begin()); }; copy(in0_g_n_c_wis_desc.GetLengths(), a0_g_n_c_wis_lengths); copy(in0_g_n_c_wis_desc.GetStrides(), a0_g_n_c_wis_strides); @@ -261,7 +261,7 @@ bool run_grouped_conv_conv_fwd(bool do_verification, #endif return ck::utils::check_err( - out1_device.mData, out1_host.mData, "Error: incorrect results!", 1e-5f, 1e-4f); + out1_device, out1_host, "Error: incorrect results!", 1e-5f, 1e-4f); } return true; diff --git a/example/42_groupnorm/groupnorm_sigmoid_fp16.cpp b/example/42_groupnorm/groupnorm_sigmoid_fp16.cpp index d8a8a27c97..e62001d669 100644 --- a/example/42_groupnorm/groupnorm_sigmoid_fp16.cpp +++ b/example/42_groupnorm/groupnorm_sigmoid_fp16.cpp @@ -167,7 +167,7 @@ int main(int argc, char* argv[]) ref_invoker.Run(ref_argument); y_dev.FromDevice(y.mData.data()); - pass &= ck::utils::check_err(y.mData, host_y.mData, "Error: Incorrect results", 1e-3, 1e-3); + pass &= ck::utils::check_err(y, host_y, "Error: Incorrect results", 1e-3, 1e-3); } return (pass ? 0 : 1); diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_layernorm.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_layernorm.hpp index b1e72459fd..28132aa1eb 100644 --- a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_layernorm.hpp +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_layernorm.hpp @@ -44,8 +44,8 @@ struct ReferenceGemmLayernorm : public device::BaseOperator size_t M = acc.mDesc.GetLengths()[0]; size_t N = acc.mDesc.GetLengths()[1]; - Tensor avg_acc_sq(HostTensorDescriptor(std::vector({M}))); - Tensor avg_acc(HostTensorDescriptor(std::vector({M}))); + Tensor avg_acc_sq({M}); + Tensor avg_acc({M}); Tensor acc_layernorm(acc); // reduce N dim diff --git a/library/include/ck/library/utility/algorithm.hpp b/library/include/ck/library/utility/algorithm.hpp new file mode 100644 index 0000000000..86f04dd362 --- /dev/null +++ b/library/include/ck/library/utility/algorithm.hpp @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include +#include +#include + +namespace ck { +namespace ranges { +template +auto copy(InputRange&& range, OutputIterator iter) + -> decltype(std::copy(std::begin(std::forward(range)), + std::end(std::forward(range)), + iter)) +{ + return std::copy(std::begin(std::forward(range)), + std::end(std::forward(range)), + iter); +} + +template +auto fill(OutputRange&& range, const T& init) + -> std::void_t(range)), + std::end(std::forward(range)), + init))> +{ + std::fill(std::begin(std::forward(range)), + std::end(std::forward(range)), + init); +} + +template +auto transform(InputRange&& range, OutputIterator iter, UnaryOperation unary_op) + -> decltype(std::transform(std::begin(range), std::end(range), iter, unary_op)) +{ + return std::transform(std::begin(range), std::end(range), iter, unary_op); +} + +} // namespace ranges +} // namespace ck diff --git a/library/include/ck/library/utility/check_err.hpp b/library/include/ck/library/utility/check_err.hpp index 3a5cd1da76..a89d03d324 100644 --- a/library/include/ck/library/utility/check_err.hpp +++ b/library/include/ck/library/utility/check_err.hpp @@ -15,18 +15,22 @@ #include "ck/ck.hpp" #include "ck/utility/data_type.hpp" -#include "ck/utility/span.hpp" #include "ck/utility/type.hpp" #include "ck/host_utility/io.hpp" +#include "ck/library/utility/ranges.hpp" + namespace ck { namespace utils { -template -typename std::enable_if::value && !std::is_same::value, - bool>::type -check_err(const std::vector& out, - const std::vector& ref, +template +typename std::enable_if< + std::is_same_v, ranges::range_value_t> && + std::is_floating_point_v> && + !std::is_same_v, half_t>, + bool>::type +check_err(const Range& out, + const RefRange& ref, const std::string& msg = "Error: Incorrect results!", double rtol = 1e-5, double atol = 3e-6) @@ -44,15 +48,17 @@ check_err(const std::vector& out, double max_err = std::numeric_limits::min(); for(std::size_t i = 0; i < ref.size(); ++i) { - err = std::abs(out[i] - ref[i]); - if(err > atol + rtol * std::abs(ref[i]) || !std::isfinite(out[i]) || !std::isfinite(ref[i])) + const double o = *std::next(std::begin(out), i); + const double r = *std::next(std::begin(ref), i); + err = std::abs(o - r); + if(err > atol + rtol * std::abs(r) || !std::isfinite(o) || !std::isfinite(r)) { max_err = err > max_err ? err : max_err; err_count++; if(err_count < 5) { std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i - << "] != ref[" << i << "]: " << out[i] << " != " << ref[i] << std::endl; + << "] != ref[" << i << "]: " << o << " != " << r << std::endl; } res = false; } @@ -64,10 +70,13 @@ check_err(const std::vector& out, return res; } -template -typename std::enable_if::value, bool>::type -check_err(const std::vector& out, - const std::vector& ref, +template +typename std::enable_if< + std::is_same_v, ranges::range_value_t> && + std::is_same_v, bhalf_t>, + bool>::type +check_err(const Range& out, + const RefRange& ref, const std::string& msg = "Error: Incorrect results!", double rtol = 1e-3, double atol = 1e-3) @@ -86,9 +95,9 @@ check_err(const std::vector& out, double max_err = std::numeric_limits::min(); for(std::size_t i = 0; i < ref.size(); ++i) { - double o = type_convert(out[i]); - double r = type_convert(ref[i]); - err = std::abs(o - r); + const double o = type_convert(*std::next(std::begin(out), i)); + const double r = type_convert(*std::next(std::begin(ref), i)); + err = std::abs(o - r); if(err > atol + rtol * std::abs(r) || !std::isfinite(o) || !std::isfinite(r)) { max_err = err > max_err ? err : max_err; @@ -108,10 +117,13 @@ check_err(const std::vector& out, return res; } -template -typename std::enable_if, bool>::type -check_err(span out, - span ref, +template +typename std::enable_if< + std::is_same_v, ranges::range_value_t> && + std::is_same_v, half_t>, + bool>::type +check_err(const Range& out, + const RefRange& ref, const std::string& msg = "Error: Incorrect results!", double rtol = 1e-3, double atol = 1e-3) @@ -126,12 +138,12 @@ check_err(span out, bool res{true}; int err_count = 0; double err = 0; - double max_err = std::numeric_limits::min(); + double max_err = std::numeric_limits>::min(); for(std::size_t i = 0; i < ref.size(); ++i) { - double o = type_convert(out[i]); - double r = type_convert(ref[i]); - err = std::abs(o - r); + const double o = type_convert(*std::next(std::begin(out), i)); + const double r = type_convert(*std::next(std::begin(ref), i)); + err = std::abs(o - r); if(err > atol + rtol * std::abs(r) || !std::isfinite(o) || !std::isfinite(r)) { max_err = err > max_err ? err : max_err; @@ -151,26 +163,17 @@ check_err(span out, return res; } -template -typename std::enable_if::value, bool>::type -check_err(const std::vector& out, - const std::vector& ref, - const std::string& msg = "Error: Incorrect results!", - double rtol = 1e-3, - double atol = 1e-3) -{ - return check_err(span{out}, span{ref}, msg, rtol, atol); -} - -template -std::enable_if_t<(std::is_integral_v && !std::is_same_v) +template +std::enable_if_t<(std::is_same_v, ranges::range_value_t> && + std::is_integral_v> && + !std::is_same_v, bhalf_t>) #ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 - || std::is_same_v + || std::is_same_v, int4_t> #endif , bool> -check_err(const std::vector& out, - const std::vector& ref, +check_err(const Range& out, + const RefRange& ref, const std::string& msg = "Error: Incorrect results!", double = 0, double atol = 0) @@ -188,9 +191,9 @@ check_err(const std::vector& out, int64_t max_err = std::numeric_limits::min(); for(std::size_t i = 0; i < ref.size(); ++i) { - int64_t o = out[i]; - int64_t r = ref[i]; - err = std::abs(o - r); + const int64_t o = *std::next(std::begin(out), i); + const int64_t r = *std::next(std::begin(ref), i); + err = std::abs(o - r); if(err > atol) { diff --git a/library/include/ck/library/utility/host_tensor.hpp b/library/include/ck/library/utility/host_tensor.hpp index 5ca34266a1..a8c7fd0395 100644 --- a/library/include/ck/library/utility/host_tensor.hpp +++ b/library/include/ck/library/utility/host_tensor.hpp @@ -14,6 +14,9 @@ #include "ck/utility/data_type.hpp" #include "ck/utility/span.hpp" +#include "ck/library/utility/algorithm.hpp" +#include "ck/library/utility/ranges.hpp" + template std::ostream& LogRange(std::ostream& os, Range&& range, std::string delim) { @@ -84,10 +87,10 @@ struct HostTensorDescriptor this->CalculateStrides(); } - template ())), std::size_t>>> - HostTensorDescriptor(const Range& lens) : mLens(lens.begin(), lens.end()) + std::is_convertible_v, std::size_t>>> + HostTensorDescriptor(const Lengths& lens) : mLens(lens.begin(), lens.end()) { this->CalculateStrides(); } @@ -102,13 +105,12 @@ struct HostTensorDescriptor { } - template < - typename Range1, - typename Range2, - typename = std::enable_if_t< - std::is_convertible_v())), std::size_t> && - std::is_convertible_v())), std::size_t>>> - HostTensorDescriptor(const Range1& lens, const Range2& strides) + template , std::size_t> && + std::is_convertible_v, std::size_t>>> + HostTensorDescriptor(const Lengths& lens, const Strides& strides) : mLens(lens.begin(), lens.end()), mStrides(strides.begin(), strides.end()) { } @@ -244,14 +246,20 @@ struct Tensor { } - template - Tensor(std::vector lens) : mDesc(lens), mData(mDesc.GetElementSpaceSize()) + template + Tensor(std::initializer_list lens, std::initializer_list strides) + : mDesc(lens, strides), mData(mDesc.GetElementSpaceSize()) { } - template - Tensor(std::vector lens, std::vector strides) - : mDesc(lens, strides), mData(mDesc.GetElementSpaceSize()) + template + Tensor(const Lengths& lens) : mDesc(lens), mData(mDesc.GetElementSpaceSize()) + { + } + + template + Tensor(const Lengths& lens, const Strides& strides) + : mDesc(lens, strides), mData(GetElementSpaceSize()) { } @@ -261,10 +269,10 @@ struct Tensor Tensor CopyAsType() const { Tensor ret(mDesc); - for(size_t i = 0; i < mData.size(); i++) - { - ret.mData[i] = ck::type_convert(mData[i]); - } + + ck::ranges::transform( + mData, ret.mData.begin(), [](auto value) { return ck::type_convert(value); }); + return ret; } @@ -294,13 +302,7 @@ struct Tensor std::size_t GetElementSpaceSizeInBytes() const { return sizeof(T) * GetElementSpaceSize(); } - void SetZero() - { - for(auto& v : mData) - { - v = T{0}; - } - } + void SetZero() { ck::ranges::fill(mData, 0); } template void ForEach_impl(F&& f, std::vector& idx, size_t rank) diff --git a/library/include/ck/library/utility/iterator.hpp b/library/include/ck/library/utility/iterator.hpp new file mode 100644 index 0000000000..9fdc88ea76 --- /dev/null +++ b/library/include/ck/library/utility/iterator.hpp @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include + +#include "ck/utility/type.hpp" + +namespace ck { + +template +using iter_value_t = typename std::iterator_traits>::value_type; + +template +using iter_reference_t = decltype(*std::declval()); + +template +using iter_difference_t = typename std::iterator_traits>::difference_type; + +} // namespace ck diff --git a/library/include/ck/library/utility/ranges.hpp b/library/include/ck/library/utility/ranges.hpp new file mode 100644 index 0000000000..55c322f1ac --- /dev/null +++ b/library/include/ck/library/utility/ranges.hpp @@ -0,0 +1,60 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include +#include + +#include "ck/library/utility/iterator.hpp" + +namespace ck { +namespace ranges { + +template +using iterator_t = decltype(std::begin(std::declval())); + +template +using sentinel_t = decltype(std::end(std::declval())); + +template +using range_size_t = decltype(std::size(std::declval())); + +template +using range_difference_t = ck::iter_difference_t>; + +template +using range_value_t = iter_value_t>; + +template +using range_reference_t = iter_reference_t>; + +template +struct is_range : std::false_type +{ +}; + +template +struct is_range< + T, + std::void_t())), decltype(std::end(std::declval()))>> + : std::true_type +{ +}; + +template +inline constexpr bool is_range_v = is_range::value; + +template +struct is_sized_range : std::false_type +{ +}; + +template +struct is_sized_range()))>> + : std::bool_constant> +{ +}; +} // namespace ranges +} // namespace ck diff --git a/profiler/include/profile_batched_gemm_add_relu_gemm_add_impl.hpp b/profiler/include/profile_batched_gemm_add_relu_gemm_add_impl.hpp index 3fa274c3ae..b16254279c 100644 --- a/profiler/include/profile_batched_gemm_add_relu_gemm_add_impl.hpp +++ b/profiler/include/profile_batched_gemm_add_relu_gemm_add_impl.hpp @@ -14,6 +14,7 @@ #include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp" namespace ck { @@ -111,15 +112,15 @@ bool profile_batched_gemm_add_relu_gemm_add_impl(bool do_verification, std::size_t stride, std::size_t batch_stride, auto layout) { + using namespace ck::literals; + if(std::is_same::value) { - return HostTensorDescriptor(std::vector({batch_count, row, col}), - std::vector({batch_stride, stride, 1})); + return HostTensorDescriptor({batch_count, row, col}, {batch_stride, stride, 1_uz}); } else { - return HostTensorDescriptor(std::vector({batch_count, row, col}), - std::vector({batch_stride, 1, stride})); + return HostTensorDescriptor({batch_count, row, col}, {batch_stride, 1_uz, stride}); } }; @@ -330,8 +331,7 @@ bool profile_batched_gemm_add_relu_gemm_add_impl(bool do_verification, { e1_g_m_o_device_buf.FromDevice(e1_g_m_o_device_result.mData.data()); - pass = pass & ck::utils::check_err(e1_g_m_o_device_result.mData, - e1_g_m_o_host_result.mData); + pass = pass & ck::utils::check_err(e1_g_m_o_device_result, e1_g_m_o_host_result); if(do_log) { diff --git a/profiler/include/profile_batched_gemm_gemm_impl.hpp b/profiler/include/profile_batched_gemm_gemm_impl.hpp index d31daf7bc9..1583c6db21 100644 --- a/profiler/include/profile_batched_gemm_gemm_impl.hpp +++ b/profiler/include/profile_batched_gemm_gemm_impl.hpp @@ -16,6 +16,7 @@ #include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp" namespace ck { @@ -105,15 +106,15 @@ bool profile_batched_gemm_gemm_impl(bool do_verification, std::size_t stride, std::size_t batch_stride, auto layout) { + using namespace ck::literals; + if(std::is_same::value) { - return HostTensorDescriptor(std::vector({batch_count, row, col}), - std::vector({batch_stride, stride, 1})); + return HostTensorDescriptor({batch_count, row, col}, {batch_stride, stride, 1_uz}); } else { - return HostTensorDescriptor(std::vector({batch_count, row, col}), - std::vector({batch_stride, 1, stride})); + return HostTensorDescriptor({batch_count, row, col}, {batch_stride, 1_uz, stride}); } }; @@ -283,8 +284,7 @@ bool profile_batched_gemm_gemm_impl(bool do_verification, { c_g_m_o_device_buf.FromDevice(c_g_m_o_device_result.mData.data()); - pass = pass & - ck::utils::check_err(c_g_m_o_device_result.mData, c_g_m_o_host_result.mData); + pass = pass & ck::utils::check_err(c_g_m_o_device_result, c_g_m_o_host_result); if(do_log) { diff --git a/profiler/include/profile_batched_gemm_impl.hpp b/profiler/include/profile_batched_gemm_impl.hpp index 3d9df4c81f..c07d7c0555 100644 --- a/profiler/include/profile_batched_gemm_impl.hpp +++ b/profiler/include/profile_batched_gemm_impl.hpp @@ -16,6 +16,7 @@ #include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp" namespace ck { @@ -50,15 +51,15 @@ bool profile_batched_gemm_impl(int do_verification, std::size_t stride, std::size_t batch_stride, auto layout) { + using namespace ck::literals; + if(is_same::value) { - return HostTensorDescriptor(std::vector({batch_count, row, col}), - std::vector({batch_stride, stride, 1})); + return HostTensorDescriptor({batch_count, row, col}, {batch_stride, stride, 1_uz}); } else { - return HostTensorDescriptor(std::vector({batch_count, row, col}), - std::vector({batch_stride, 1, stride})); + return HostTensorDescriptor({batch_count, row, col}, {batch_stride, 1_uz, stride}); } }; @@ -202,8 +203,7 @@ bool profile_batched_gemm_impl(int do_verification, { c_device_buf.FromDevice(c_g_m_n_device_result.mData.data()); - pass = pass & - ck::utils::check_err(c_g_m_n_device_result.mData, c_g_m_n_host_result.mData); + pass = pass & ck::utils::check_err(c_g_m_n_device_result, c_g_m_n_host_result); if(do_log) { diff --git a/profiler/include/profile_batched_gemm_reduce_impl.hpp b/profiler/include/profile_batched_gemm_reduce_impl.hpp index 9807e020f5..45b7b77388 100644 --- a/profiler/include/profile_batched_gemm_reduce_impl.hpp +++ b/profiler/include/profile_batched_gemm_reduce_impl.hpp @@ -14,6 +14,7 @@ #include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp" namespace ck { @@ -78,15 +79,15 @@ bool profile_batched_gemm_reduce_impl(int do_verification, std::size_t col, std::size_t stride, auto layout) { + using namespace ck::literals; + if(std::is_same::value) { - return HostTensorDescriptor(std::vector({batch_count, row, col}), - std::vector({row * stride, stride, 1})); + return HostTensorDescriptor({batch_count, row, col}, {row * stride, stride, 1_uz}); } else { - return HostTensorDescriptor(std::vector({batch_count, row, col}), - std::vector({col * stride, 1, stride})); + return HostTensorDescriptor({batch_count, row, col}, {col * stride, 1_uz, stride}); } }; @@ -95,17 +96,13 @@ bool profile_batched_gemm_reduce_impl(int do_verification, Tensor c_g_m_n_host_result( f_host_tensor_descriptor(BatchCount, M, N, StrideC, CLayout{})); - Tensor d0_g_m_host_result(HostTensorDescriptor(std::vector( - {static_cast(BatchCount), static_cast(M)}))); - Tensor d1_g_m_host_result(HostTensorDescriptor(std::vector( - {static_cast(BatchCount), static_cast(M)}))); + Tensor d0_g_m_host_result({BatchCount, M}); + Tensor d1_g_m_host_result({BatchCount, M}); Tensor c_g_m_n_device_result( f_host_tensor_descriptor(BatchCount, M, N, StrideC, CLayout{})); - Tensor d0_g_m_device_result(HostTensorDescriptor(std::vector( - {static_cast(BatchCount), static_cast(M)}))); - Tensor d1_g_m_device_result(HostTensorDescriptor(std::vector( - {static_cast(BatchCount), static_cast(M)}))); + Tensor d0_g_m_device_result({BatchCount, M}); + Tensor d1_g_m_device_result({BatchCount, M}); std::cout << "a_g_m_k: " << a_g_m_k.mDesc << std::endl; std::cout << "b_g_k_n: " << b_g_k_n.mDesc << std::endl; @@ -319,12 +316,9 @@ bool profile_batched_gemm_reduce_impl(int do_verification, reduce0_device_buf.FromDevice(d0_g_m_device_result.mData.data()); reduce1_device_buf.FromDevice(d1_g_m_device_result.mData.data()); - bool c_error = - ck::utils::check_err(c_g_m_n_device_result.mData, c_g_m_n_host_result.mData); - bool d0_error = - ck::utils::check_err(d0_g_m_device_result.mData, d0_g_m_host_result.mData); - bool d1_error = - ck::utils::check_err(d1_g_m_device_result.mData, d1_g_m_host_result.mData); + bool c_error = ck::utils::check_err(c_g_m_n_device_result, c_g_m_n_host_result); + bool d0_error = ck::utils::check_err(d0_g_m_device_result, d0_g_m_host_result); + bool d1_error = ck::utils::check_err(d1_g_m_device_result, d1_g_m_host_result); pass = pass && (c_error == true); pass = pass && (d0_error == true); diff --git a/profiler/include/profile_batched_gemm_softmax_gemm_impl.hpp b/profiler/include/profile_batched_gemm_softmax_gemm_impl.hpp index 6b0a25aca2..fe76fcaf0b 100644 --- a/profiler/include/profile_batched_gemm_softmax_gemm_impl.hpp +++ b/profiler/include/profile_batched_gemm_softmax_gemm_impl.hpp @@ -16,6 +16,7 @@ #include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp" @@ -113,15 +114,15 @@ bool profile_batched_gemm_softmax_gemm_impl(bool do_verification, std::size_t stride, std::size_t batch_stride, auto layout) { + using namespace ck::literals; + if(std::is_same::value) { - return HostTensorDescriptor(std::vector({batch_count, row, col}), - std::vector({batch_stride, stride, 1})); + return HostTensorDescriptor({batch_count, row, col}, {batch_stride, stride, 1_uz}); } else { - return HostTensorDescriptor(std::vector({batch_count, row, col}), - std::vector({batch_stride, 1, stride})); + return HostTensorDescriptor({batch_count, row, col}, {batch_stride, 1_uz, stride}); } }; @@ -307,8 +308,7 @@ bool profile_batched_gemm_softmax_gemm_impl(bool do_verification, { c_g_m_o_device_buf.FromDevice(c_g_m_o_device_result.mData.data()); - pass = pass & - ck::utils::check_err(c_g_m_o_device_result.mData, c_g_m_o_host_result.mData); + pass = pass & ck::utils::check_err(c_g_m_o_device_result, c_g_m_o_host_result); if(do_log) { diff --git a/profiler/include/profile_batched_gemm_softmax_gemm_permute_impl.hpp b/profiler/include/profile_batched_gemm_softmax_gemm_permute_impl.hpp index 5533a88d54..0da5d05dc4 100644 --- a/profiler/include/profile_batched_gemm_softmax_gemm_permute_impl.hpp +++ b/profiler/include/profile_batched_gemm_softmax_gemm_permute_impl.hpp @@ -16,6 +16,7 @@ #include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp" @@ -308,8 +309,8 @@ bool profile_batched_gemm_softmax_gemm_permute_impl(bool do_verification, { c_device_buf.FromDevice(c_gs_ms_os_device_result.mData.data()); - pass = pass & ck::utils::check_err(c_gs_ms_os_device_result.mData, - c_gs_ms_os_host_result.mData); + pass = + pass & ck::utils::check_err(c_gs_ms_os_device_result, c_gs_ms_os_host_result); if(do_log) { diff --git a/profiler/include/profile_conv_bwd_data_impl.hpp b/profiler/include/profile_conv_bwd_data_impl.hpp index b0243e1b25..86d394daf9 100644 --- a/profiler/include/profile_conv_bwd_data_impl.hpp +++ b/profiler/include/profile_conv_bwd_data_impl.hpp @@ -209,8 +209,7 @@ bool profile_conv_bwd_data_impl(int do_verification, { in_device_buf.FromDevice(input_device_result.mData.data()); - pass = - pass & ck::utils::check_err(input_device_result.mData, input_host_result.mData); + pass = pass & ck::utils::check_err(input_device_result, input_host_result); if(do_log) { diff --git a/profiler/include/profile_conv_fwd_bias_relu_add_impl.hpp b/profiler/include/profile_conv_fwd_bias_relu_add_impl.hpp index aad48946c8..1aebef8bb2 100644 --- a/profiler/include/profile_conv_fwd_bias_relu_add_impl.hpp +++ b/profiler/include/profile_conv_fwd_bias_relu_add_impl.hpp @@ -12,6 +12,7 @@ #include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation_add.hpp" namespace ck { @@ -68,19 +69,19 @@ void profile_conv_fwd_bias_relu_add_impl(int do_verification, auto f_host_tensor_descriptor = [](std::size_t N_, std::size_t C_, std::size_t H, std::size_t W, auto layout) { + using namespace ck::literals; + if constexpr(is_same::value || is_same::value || is_same::value) { - return HostTensorDescriptor(std::vector({N_, C_, H, W}), - std::vector({C_ * H * W, H * W, W, 1})); + return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, H * W, W, 1_uz}); } else if constexpr(is_same::value || is_same::value || is_same::value) { - return HostTensorDescriptor(std::vector({N_, C_, H, W}), - std::vector({C_ * H * W, 1, W * C_, C_})); + return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, 1_uz, W * C_, C_}); } }; @@ -92,8 +93,7 @@ void profile_conv_fwd_bias_relu_add_impl(int do_verification, f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{})); // bias: assume contiguous 1d vector - Tensor bias_k( - HostTensorDescriptor(std::vector({static_cast(K)}))); + Tensor bias_k({K}); // residual: assume same layout as output tensor Tensor resi_n_k_ho_wo(f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{})); @@ -251,8 +251,7 @@ void profile_conv_fwd_bias_relu_add_impl(int do_verification, { out_device_buf.FromDevice(out_n_k_ho_wo_device_result.mData.data()); - ck::utils::check_err(out_n_k_ho_wo_device_result.mData, - out_n_k_ho_wo_host_result.mData); + ck::utils::check_err(out_n_k_ho_wo_device_result, out_n_k_ho_wo_host_result); if(do_log) { diff --git a/profiler/include/profile_conv_fwd_bias_relu_impl.hpp b/profiler/include/profile_conv_fwd_bias_relu_impl.hpp index f546606d67..2bac144334 100644 --- a/profiler/include/profile_conv_fwd_bias_relu_impl.hpp +++ b/profiler/include/profile_conv_fwd_bias_relu_impl.hpp @@ -12,6 +12,7 @@ #include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation.hpp" namespace ck { @@ -68,19 +69,19 @@ void profile_conv_fwd_bias_relu_impl(int do_verification, auto f_host_tensor_descriptor = [](std::size_t N_, std::size_t C_, std::size_t H, std::size_t W, auto layout) { + using namespace ck::literals; + if constexpr(is_same::value || is_same::value || is_same::value) { - return HostTensorDescriptor(std::vector({N_, C_, H, W}), - std::vector({C_ * H * W, H * W, W, 1})); + return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, H * W, W, 1_uz}); } else if constexpr(is_same::value || is_same::value || is_same::value) { - return HostTensorDescriptor(std::vector({N_, C_, H, W}), - std::vector({C_ * H * W, 1, W * C_, C_})); + return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, 1_uz, W * C_, C_}); } }; @@ -92,8 +93,7 @@ void profile_conv_fwd_bias_relu_impl(int do_verification, f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{})); // bias: assume contiguous 1d vector - Tensor bias_k( - HostTensorDescriptor(std::vector({static_cast(K)}))); + Tensor bias_k({K}); std::cout << "in_n_c_hi_wi: " << in_n_c_hi_wi.mDesc << std::endl; std::cout << "wei_k_c_y_x: " << wei_k_c_y_x.mDesc << std::endl; @@ -239,8 +239,7 @@ void profile_conv_fwd_bias_relu_impl(int do_verification, { out_device_buf.FromDevice(out_n_k_ho_wo_device_result.mData.data()); - ck::utils::check_err(out_n_k_ho_wo_device_result.mData, - out_n_k_ho_wo_host_result.mData); + ck::utils::check_err(out_n_k_ho_wo_device_result, out_n_k_ho_wo_host_result); if(do_log) { diff --git a/profiler/include/profile_conv_fwd_impl.hpp b/profiler/include/profile_conv_fwd_impl.hpp index 4a91fede02..1f3ba8f007 100644 --- a/profiler/include/profile_conv_fwd_impl.hpp +++ b/profiler/include/profile_conv_fwd_impl.hpp @@ -191,7 +191,7 @@ bool profile_conv_fwd_impl(int do_verification, { out_device_buf.FromDevice(device_output.mData.data()); - pass = pass & ck::utils::check_err(device_output.mData, host_output.mData); + pass = pass & ck::utils::check_err(device_output, host_output); if(do_log) { diff --git a/profiler/include/profile_convnd_bwd_data_impl.hpp b/profiler/include/profile_convnd_bwd_data_impl.hpp index cf9ae8dff1..1e69ebc8bd 100644 --- a/profiler/include/profile_convnd_bwd_data_impl.hpp +++ b/profiler/include/profile_convnd_bwd_data_impl.hpp @@ -453,7 +453,7 @@ bool profile_convnd_bwd_data_impl(int do_verification, std::cout << "Pass Info: " << conv_ptr->GetTypeString() << std::endl; } - success = ck::utils::check_err(input_host_result.mData, input_device_result.mData); + success = ck::utils::check_err(input_host_result, input_device_result); if(do_log) { diff --git a/profiler/include/profile_convnd_bwd_weight_impl.hpp b/profiler/include/profile_convnd_bwd_weight_impl.hpp index 8a6897a994..e37c887a96 100644 --- a/profiler/include/profile_convnd_bwd_weight_impl.hpp +++ b/profiler/include/profile_convnd_bwd_weight_impl.hpp @@ -433,7 +433,7 @@ bool profile_convnd_bwd_weight_impl(int do_verification, { wei_device_buf.FromDevice(weights_device_result.mData.data()); - success = ck::utils::check_err(weights_host_result.mData, weights_device_result.mData); + success = ck::utils::check_err(weights_host_result, weights_device_result); if(success == false) { diff --git a/profiler/include/profile_elementwise_layernorm_impl.hpp b/profiler/include/profile_elementwise_layernorm_impl.hpp index f5135005f2..7707e16b08 100644 --- a/profiler/include/profile_elementwise_layernorm_impl.hpp +++ b/profiler/include/profile_elementwise_layernorm_impl.hpp @@ -13,6 +13,7 @@ #include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp" namespace ck { @@ -68,8 +69,9 @@ bool profile_elementwise_layernorm_impl(int do_verification, std::vector gammaBetaStride = {0, 1}; auto f_host_tensor_descriptor2d = [](std::size_t row, std::size_t col, std::size_t stride) { - return HostTensorDescriptor(std::vector({row, col}), - std::vector({stride, 1})); + using namespace ck::literals; + + return HostTensorDescriptor({row, col}, {stride, 1_uz}); }; Tensor a(length); diff --git a/profiler/include/profile_gemm_add_add_fastgelu_impl.hpp b/profiler/include/profile_gemm_add_add_fastgelu_impl.hpp index d4d37adae5..3cc2ea3b92 100644 --- a/profiler/include/profile_gemm_add_add_fastgelu_impl.hpp +++ b/profiler/include/profile_gemm_add_add_fastgelu_impl.hpp @@ -16,6 +16,7 @@ #include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" namespace ck { @@ -47,15 +48,15 @@ bool profile_gemm_add_add_fastgelu_impl(int do_verification, { auto f_host_tensor_descriptor = [](std::size_t row, std::size_t col, std::size_t stride, auto layout) { + using namespace ck::literals; + if(is_same::value) { - return HostTensorDescriptor(std::vector({row, col}), - std::vector({stride, 1})); + return HostTensorDescriptor({row, col}, {stride, 1_uz}); } else { - return HostTensorDescriptor(std::vector({row, col}), - std::vector({1, stride})); + return HostTensorDescriptor({row, col}, {1_uz, stride}); } }; @@ -121,8 +122,7 @@ bool profile_gemm_add_add_fastgelu_impl(int do_verification, // run reference if(do_verification) { - Tensor c_m_n(HostTensorDescriptor( - std::vector{static_cast(M), static_cast(N)})); + Tensor c_m_n({M, N}); using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm({len}), - std::vector({stride})); + return HostTensorDescriptor({len}, {stride}); }; auto f_host_tensor_descriptor2d = [](std::size_t row, std::size_t col, std::size_t stride, auto layout) { + using namespace ck::literals; + if(is_same::value) { - return HostTensorDescriptor(std::vector({row, col}), - std::vector({stride, 1})); + return HostTensorDescriptor({row, col}, {stride, 1_uz}); } else { - return HostTensorDescriptor(std::vector({row, col}), - std::vector({1, stride})); + return HostTensorDescriptor({row, col}, {1_uz, stride}); } }; @@ -99,16 +99,12 @@ void profile_gemm_bias_add_reduce_impl(int do_verification, Tensor c_m_n_host_result(f_host_tensor_descriptor2d(M, N, StrideC, CLayout{})); Tensor bias_n(f_host_tensor_descriptor1d(N, 1)); Tensor d0_m_n(f_host_tensor_descriptor2d(M, N, StrideC, CLayout{})); - Tensor reduce0_m_host_result( - HostTensorDescriptor(std::vector({static_cast(M)}))); - Tensor reduce1_m_host_result( - HostTensorDescriptor(std::vector({static_cast(M)}))); + Tensor reduce0_m_host_result({M}); + Tensor reduce1_m_host_result({M}); Tensor c_m_n_device_result(f_host_tensor_descriptor2d(M, N, StrideC, CLayout{})); - Tensor reduce0_m_device_result( - HostTensorDescriptor(std::vector({static_cast(M)}))); - Tensor reduce1_m_device_result( - HostTensorDescriptor(std::vector({static_cast(M)}))); + Tensor reduce0_m_device_result({M}); + Tensor reduce1_m_device_result({M}); std::cout << "a_m_k: " << a_m_k.mDesc << std::endl; std::cout << "b_k_n: " << b_k_n.mDesc << std::endl; @@ -347,9 +343,9 @@ void profile_gemm_bias_add_reduce_impl(int do_verification, reduce0_device_buf.FromDevice(reduce0_m_device_result.mData.data()); reduce1_device_buf.FromDevice(reduce1_m_device_result.mData.data()); - ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData); - ck::utils::check_err(reduce0_m_device_result.mData, reduce0_m_host_result.mData); - ck::utils::check_err(reduce1_m_device_result.mData, reduce1_m_host_result.mData); + ck::utils::check_err(c_m_n_device_result, c_m_n_host_result); + ck::utils::check_err(reduce0_m_device_result, reduce0_m_host_result); + ck::utils::check_err(reduce1_m_device_result, reduce1_m_host_result); if(do_log) { diff --git a/profiler/include/profile_gemm_bilinear_impl.hpp b/profiler/include/profile_gemm_bilinear_impl.hpp index 17d0553db8..31bae281c4 100644 --- a/profiler/include/profile_gemm_bilinear_impl.hpp +++ b/profiler/include/profile_gemm_bilinear_impl.hpp @@ -16,6 +16,7 @@ #include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" namespace ck { @@ -46,15 +47,15 @@ bool profile_gemm_bilinear_impl(int do_verification, { auto f_host_tensor_descriptor = [](std::size_t row, std::size_t col, std::size_t stride, auto layout) { + using namespace ck::literals; + if(is_same::value) { - return HostTensorDescriptor(std::vector({row, col}), - std::vector({stride, 1})); + return HostTensorDescriptor({row, col}, {stride, 1_uz}); } else { - return HostTensorDescriptor(std::vector({row, col}), - std::vector({1, stride})); + return HostTensorDescriptor({row, col}, {1_uz, stride}); } }; @@ -116,8 +117,7 @@ bool profile_gemm_bilinear_impl(int do_verification, // run reference if(do_verification) { - Tensor c_m_n(HostTensorDescriptor( - std::vector{static_cast(M), static_cast(N)})); + Tensor c_m_n({M, N}); using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm::value) { - return HostTensorDescriptor(std::vector({row, col}), - std::vector({stride, 1})); + return HostTensorDescriptor({row, col}, {stride, 1_uz}); } else { - return HostTensorDescriptor(std::vector({row, col}), - std::vector({1, stride})); + return HostTensorDescriptor({row, col}, {1_uz, stride}); } }; @@ -187,8 +188,7 @@ int profile_gemm_impl(int do_verification, { c_device_buf.FromDevice(c_m_n_device_result.mData.data()); - pass = - pass & ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData); + pass = pass & ck::utils::check_err(c_m_n_device_result, c_m_n_host_result); if(do_log) { diff --git a/profiler/include/profile_gemm_reduce_impl.hpp b/profiler/include/profile_gemm_reduce_impl.hpp index fd4db3bce4..370121a3cc 100644 --- a/profiler/include/profile_gemm_reduce_impl.hpp +++ b/profiler/include/profile_gemm_reduce_impl.hpp @@ -14,6 +14,7 @@ #include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" namespace ck { @@ -75,15 +76,15 @@ bool profile_gemm_reduce_impl(int do_verification, auto f_host_tensor_descriptor = [](std::size_t row, std::size_t col, std::size_t stride, auto layout) { + using namespace ck::literals; + if(is_same::value) { - return HostTensorDescriptor(std::vector({row, col}), - std::vector({stride, 1})); + return HostTensorDescriptor({row, col}, {stride, 1_uz}); } else { - return HostTensorDescriptor(std::vector({row, col}), - std::vector({1, stride})); + return HostTensorDescriptor({row, col}, {1_uz, stride}); } }; @@ -91,16 +92,12 @@ bool profile_gemm_reduce_impl(int do_verification, Tensor b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{})); Tensor c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); - Tensor reduce0_m_host_result( - HostTensorDescriptor(std::vector({static_cast(M)}))); - Tensor reduce1_m_host_result( - HostTensorDescriptor(std::vector({static_cast(M)}))); + Tensor reduce0_m_host_result({M}); + Tensor reduce1_m_host_result({M}); Tensor c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); - Tensor reduce0_m_device_result( - HostTensorDescriptor(std::vector({static_cast(M)}))); - Tensor reduce1_m_device_result( - HostTensorDescriptor(std::vector({static_cast(M)}))); + Tensor reduce0_m_device_result({M}); + Tensor reduce1_m_device_result({M}); std::cout << "a_m_k: " << a_m_k.mDesc << std::endl; std::cout << "b_k_n: " << b_k_n.mDesc << std::endl; @@ -313,9 +310,9 @@ bool profile_gemm_reduce_impl(int do_verification, reduce0_device_buf.FromDevice(reduce0_m_device_result.mData.data()); reduce1_device_buf.FromDevice(reduce1_m_device_result.mData.data()); - ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData); - ck::utils::check_err(reduce0_m_device_result.mData, reduce0_m_host_result.mData); - ck::utils::check_err(reduce1_m_device_result.mData, reduce1_m_host_result.mData); + ck::utils::check_err(c_m_n_device_result, c_m_n_host_result); + ck::utils::check_err(reduce0_m_device_result, reduce0_m_host_result); + ck::utils::check_err(reduce1_m_device_result, reduce1_m_host_result); if(do_log) { diff --git a/profiler/include/profile_gemm_splitk_impl.hpp b/profiler/include/profile_gemm_splitk_impl.hpp index ba6ceb7514..e5d5f8765e 100644 --- a/profiler/include/profile_gemm_splitk_impl.hpp +++ b/profiler/include/profile_gemm_splitk_impl.hpp @@ -18,6 +18,7 @@ #include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" namespace ck { @@ -46,15 +47,15 @@ bool profile_gemm_splitk_impl(int do_verification, auto f_host_tensor_descriptor = [](std::size_t row, std::size_t col, std::size_t stride, auto layout) { + using namespace ck::literals; + if(is_same::value) { - return HostTensorDescriptor(std::vector({row, col}), - std::vector({stride, 1})); + return HostTensorDescriptor({row, col}, {stride, 1_uz}); } else { - return HostTensorDescriptor(std::vector({row, col}), - std::vector({1, stride})); + return HostTensorDescriptor({row, col}, {1_uz, stride}); } }; @@ -190,8 +191,7 @@ bool profile_gemm_splitk_impl(int do_verification, { c_device_buf.FromDevice(c_m_n_device_result.mData.data()); - pass = - pass & ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData); + pass = pass & ck::utils::check_err(c_m_n_device_result, c_m_n_host_result); if(do_log) { diff --git a/profiler/include/profile_grouped_conv_bwd_weight_impl.hpp b/profiler/include/profile_grouped_conv_bwd_weight_impl.hpp index d697a9400a..4f9aa98376 100644 --- a/profiler/include/profile_grouped_conv_bwd_weight_impl.hpp +++ b/profiler/include/profile_grouped_conv_bwd_weight_impl.hpp @@ -209,8 +209,7 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification, { wei_device_buf.FromDevice(weight_device_result.mData.data()); - bool pass = - ck::utils::check_err(weight_device_result.mData, weight_host_result.mData); + bool pass = ck::utils::check_err(weight_device_result, weight_host_result); if(!pass) { diff --git a/profiler/include/profile_grouped_conv_fwd_impl.hpp b/profiler/include/profile_grouped_conv_fwd_impl.hpp index e0ed15f687..103116461d 100644 --- a/profiler/include/profile_grouped_conv_fwd_impl.hpp +++ b/profiler/include/profile_grouped_conv_fwd_impl.hpp @@ -14,6 +14,7 @@ #include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_dl.hpp" +#include "ck/library/utility/algorithm.hpp" #include "ck/library/utility/check_err.hpp" #include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/host_tensor.hpp" @@ -66,7 +67,7 @@ bool profile_grouped_conv_fwd_impl(int do_verification, std::array input_left_pads{}; std::array input_right_pads{}; - auto copy = [](auto& x, auto& y) { std::copy(x.begin(), x.end(), y.begin()); }; + auto copy = [](const auto& x, auto& y) { ck::ranges::copy(x, y.begin()); }; copy(in_g_n_c_wis_desc.GetLengths(), a_g_n_c_wis_lengths); copy(in_g_n_c_wis_desc.GetStrides(), a_g_n_c_wis_strides); @@ -179,7 +180,7 @@ bool profile_grouped_conv_fwd_impl(int do_verification, { out_device_buf.FromDevice(device_output.mData.data()); - pass = pass & ck::utils::check_err(device_output.mData, host_output.mData); + pass = pass & ck::utils::check_err(device_output, host_output); if(do_log) { diff --git a/profiler/include/profile_grouped_gemm_impl.hpp b/profiler/include/profile_grouped_gemm_impl.hpp index 4853fc98f2..04f94a0f24 100644 --- a/profiler/include/profile_grouped_gemm_impl.hpp +++ b/profiler/include/profile_grouped_gemm_impl.hpp @@ -17,6 +17,7 @@ #include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" namespace ck { @@ -45,15 +46,15 @@ bool profile_grouped_gemm_impl(int do_verification, auto f_host_tensor_descriptor = [](std::size_t row, std::size_t col, std::size_t stride, auto layout) { + using namespace ck::literals; + if(is_same::value) { - return HostTensorDescriptor(std::vector({row, col}), - std::vector({stride, 1})); + return HostTensorDescriptor({row, col}, {stride, 1_uz}); } else { - return HostTensorDescriptor(std::vector({row, col}), - std::vector({1, stride})); + return HostTensorDescriptor({row, col}, {1_uz, stride}); } }; @@ -257,8 +258,7 @@ bool profile_grouped_gemm_impl(int do_verification, c_element_op); ref_invoker.Run(ref_argument); - pass = pass && ck::utils::check_err(c_m_n_device_results[i].mData, - c_m_n_host_result.mData); + pass = pass && ck::utils::check_err(c_m_n_device_results[i], c_m_n_host_result); if(do_log) { diff --git a/profiler/include/profile_groupnorm_impl.hpp b/profiler/include/profile_groupnorm_impl.hpp index 9b2a3e9f3f..81fec5590a 100644 --- a/profiler/include/profile_groupnorm_impl.hpp +++ b/profiler/include/profile_groupnorm_impl.hpp @@ -165,8 +165,7 @@ bool profile_groupnorm_impl(int do_verification, { y_dev.FromDevice(y.mData.data()); - bool pass = - ck::utils::check_err(y.mData, host_y.mData, "Error: Incorrect results", 1e-3, 1e-3); + bool pass = ck::utils::check_err(y, host_y, "Error: Incorrect results", 1e-3, 1e-3); if(do_log) { diff --git a/profiler/include/profile_reduce_impl.hpp b/profiler/include/profile_reduce_impl.hpp index 981962bdc5..354e6e46fa 100644 --- a/profiler/include/profile_reduce_impl.hpp +++ b/profiler/include/profile_reduce_impl.hpp @@ -411,13 +411,12 @@ bool profile_reduce_impl_impl(bool do_verification, bool single_pass; out_dev.FromDevice(out.mData.data()); - single_pass = ck::utils::check_err(out.mData, out_ref.mData); + single_pass = ck::utils::check_err(out, out_ref); if(OutputIndex) { out_indices_dev.FromDevice(out_indices.mData.data()); - single_pass = single_pass && - ck::utils::check_err(out_indices.mData, out_indices_ref.mData); + single_pass = single_pass && ck::utils::check_err(out_indices, out_indices_ref); }; if(!single_pass) diff --git a/test/gemm/gemm_util.hpp b/test/gemm/gemm_util.hpp index 6291215b35..9057c0af89 100644 --- a/test/gemm/gemm_util.hpp +++ b/test/gemm/gemm_util.hpp @@ -9,6 +9,7 @@ #include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" namespace ck { @@ -128,15 +129,15 @@ struct TestGemm { auto f_host_tensor_descriptor = [](std::size_t row, std::size_t col, std::size_t stride, auto layout) { + using namespace ck::literals; + if(std::is_same::value) { - return HostTensorDescriptor(std::vector({row, col}), - std::vector({stride, 1})); + return HostTensorDescriptor({row, col}, {stride, 1_uz}); } else { - return HostTensorDescriptor(std::vector({row, col}), - std::vector({1, stride})); + return HostTensorDescriptor({row, col}, {1_uz, stride}); } }; @@ -229,27 +230,27 @@ struct TestGemm bool res = false; if(std::is_same::value) { - res = ck::utils::check_err(c_device.mData, c_host.mData); + res = ck::utils::check_err(c_device, c_host); std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl; } else if(std::is_same::value) { - res = ck::utils::check_err(c_device.mData, c_host.mData); + res = ck::utils::check_err(c_device, c_host); std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl; } else if(std::is_same::value) { - res = ck::utils::check_err(c_device.mData, c_host.mData); + res = ck::utils::check_err(c_device, c_host); std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl; } else if(std::is_same::value) { - res = ck::utils::check_err(c_device.mData, c_host.mData); + res = ck::utils::check_err(c_device, c_host); std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl; } else if(std::is_same::value) { - res = ck::utils::check_err(c_device.mData, c_host.mData); + res = ck::utils::check_err(c_device, c_host); std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl; } diff --git a/test/gemm_split_k/gemm_split_k.cpp b/test/gemm_split_k/gemm_split_k.cpp index 0a4cc2311f..d5cb03d613 100644 --- a/test/gemm_split_k/gemm_split_k.cpp +++ b/test/gemm_split_k/gemm_split_k.cpp @@ -16,6 +16,7 @@ #include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" #include "ck/library/utility/host_gemm.hpp" @@ -93,15 +94,15 @@ int test_gemm(const gemmArgs& args) auto f_host_tensor_descriptor = [](std::size_t row, std::size_t col, std::size_t stride, bool row_major) { + using namespace ck::literals; + if(row_major) { - return HostTensorDescriptor(std::vector({row, col}), - std::vector({stride, 1})); + return HostTensorDescriptor({row, col}, {stride, 1_uz}); } else { - return HostTensorDescriptor(std::vector({row, col}), - std::vector({1, stride})); + return HostTensorDescriptor({row, col}, {1_uz, stride}); } }; diff --git a/test/reference_conv_fwd/reference_conv_fwd.cpp b/test/reference_conv_fwd/reference_conv_fwd.cpp index 82a8dbbd06..1f9ba0064c 100644 --- a/test/reference_conv_fwd/reference_conv_fwd.cpp +++ b/test/reference_conv_fwd/reference_conv_fwd.cpp @@ -12,6 +12,7 @@ #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/library/utility/algorithm.hpp" #include "ck/library/utility/check_err.hpp" #include "ck/library/utility/fill.hpp" #include "ck/library/utility/host_tensor.hpp" @@ -54,7 +55,7 @@ run_reference_convolution_forward(const ck::utils::conv::ConvParam& conv_param, fill_input_op(input.begin(), input.end()); fill_weights_op(weights.begin(), weights.end()); - std::fill(host_output.begin(), host_output.end(), OutDataType(0.f)); + ck::ranges::fill(host_output, 0.f); auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd ref_data{7.5, 13.5, 19.5, 25.5}; EXPECT_TRUE(ck::utils::check_err( out_tensor.mDesc.GetLengths(), ref_dims, "Error: wrong output tensor dimensions!")); - EXPECT_TRUE(ck::utils::check_err(out_tensor.mData, ref_data, "Error: incorrect results!")); + EXPECT_TRUE(ck::utils::check_err(out_tensor, ref_data, "Error: incorrect results!")); } TEST(ReferenceConvolutionFWD, Conv1DGNWCStridesDilationsPadding) @@ -207,7 +208,7 @@ TEST(ReferenceConvolutionFWD, Conv1DGNWCStridesDilationsPadding) std::vector ref_data{9., 9., 19.5, 19.5, 31.5, 31.5, 43.5, 43.5, 55.5, 55.5}; EXPECT_TRUE(ck::utils::check_err( out_tensor.mDesc.GetLengths(), ref_dims, "Error: wrong output tensor dimensions!")); - EXPECT_TRUE(ck::utils::check_err(out_tensor.mData, ref_data, "Error: incorrect results!")); + EXPECT_TRUE(ck::utils::check_err(out_tensor, ref_data, "Error: incorrect results!")); } TEST(ReferenceConvolutionFWD, Conv1DGNWCSameOutputSize) @@ -301,7 +302,7 @@ TEST(ReferenceConvolutionFWD, Conv1DGNWCSameOutputSize) 49.4, 49.4, 49.4, 49.4, 49.4, 49.4, 49.4, 49.4}; EXPECT_TRUE(ck::utils::check_err( out_tensor2.mDesc.GetLengths(), ref_dims, "Error: wrong output tensor dimensions!")); - EXPECT_TRUE(ck::utils::check_err(out_tensor2.mData, ref_data, "Error: incorrect results!")); + EXPECT_TRUE(ck::utils::check_err(out_tensor2, ref_data, "Error: incorrect results!")); } #endif @@ -340,8 +341,7 @@ TEST(ReferenceConvolutionFWD, Conv3DGNCDHW) EXPECT_TRUE(ck::utils::check_err(out_tensor.mDesc.GetLengths(), ref_dims, "Error [case 1]: wrong output tensor dimensions!")); - EXPECT_TRUE( - ck::utils::check_err(out_tensor.mData, ref_data, "Error [case 1]: incorrect results!")); + EXPECT_TRUE(ck::utils::check_err(out_tensor, ref_data, "Error [case 1]: incorrect results!")); } TEST(ReferenceConvolutionFWD, Conv3DGNCDHWStridesDilations) @@ -388,5 +388,5 @@ TEST(ReferenceConvolutionFWD, Conv3DGNCDHWStridesDilations) ref_dims, "Error [case 2]: wrong output tensor dimensions!")); EXPECT_TRUE(ck::utils::check_err( - out_tensor.mData, ref_data, "Error [case 2]: incorrect results!", 1e-4f, 1e-6f)); + out_tensor, ref_data, "Error [case 2]: incorrect results!", 1e-4f, 1e-6f)); }