mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-17 03:19:48 +00:00
Rangify constructor of HostTensorDescriptor & Tensor<> (#445)
* Rangify STL algorithms
This commit adapts rangified std::copy(), std::fill() & std::transform()
* Rangify check_err()
By rangifying check_err(), we can not only compare values between
std::vector<>s, but also compare any ranges which have same value
type.
* Allow constructing Tensor<> like a HostTensorDescriptor
* Simplify Tensor<> object construction logics
* Remove more unnecessary 'HostTensorDescriptor' objects
* Re-format example code
* Re-write more HostTensorDescriptor ctor call
[ROCm/composable_kernel commit: 4a2a56c22f]
This commit is contained in:
@@ -14,6 +14,7 @@
|
||||
#include "ck/library/utility/device_memory.hpp"
|
||||
#include "ck/library/utility/host_tensor.hpp"
|
||||
#include "ck/library/utility/host_tensor_generator.hpp"
|
||||
#include "ck/library/utility/literals.hpp"
|
||||
#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
|
||||
|
||||
namespace ck {
|
||||
@@ -111,15 +112,15 @@ bool profile_batched_gemm_add_relu_gemm_add_impl(bool do_verification,
|
||||
std::size_t stride,
|
||||
std::size_t batch_stride,
|
||||
auto layout) {
|
||||
using namespace ck::literals;
|
||||
|
||||
if(std::is_same<decltype(layout), Row>::value)
|
||||
{
|
||||
return HostTensorDescriptor(std::vector<std::size_t>({batch_count, row, col}),
|
||||
std::vector<std::size_t>({batch_stride, stride, 1}));
|
||||
return HostTensorDescriptor({batch_count, row, col}, {batch_stride, stride, 1_uz});
|
||||
}
|
||||
else
|
||||
{
|
||||
return HostTensorDescriptor(std::vector<std::size_t>({batch_count, row, col}),
|
||||
std::vector<std::size_t>({batch_stride, 1, stride}));
|
||||
return HostTensorDescriptor({batch_count, row, col}, {batch_stride, 1_uz, stride});
|
||||
}
|
||||
};
|
||||
|
||||
@@ -330,8 +331,7 @@ bool profile_batched_gemm_add_relu_gemm_add_impl(bool do_verification,
|
||||
{
|
||||
e1_g_m_o_device_buf.FromDevice(e1_g_m_o_device_result.mData.data());
|
||||
|
||||
pass = pass & ck::utils::check_err(e1_g_m_o_device_result.mData,
|
||||
e1_g_m_o_host_result.mData);
|
||||
pass = pass & ck::utils::check_err(e1_g_m_o_device_result, e1_g_m_o_host_result);
|
||||
|
||||
if(do_log)
|
||||
{
|
||||
|
||||
@@ -16,6 +16,7 @@
|
||||
#include "ck/library/utility/device_memory.hpp"
|
||||
#include "ck/library/utility/host_tensor.hpp"
|
||||
#include "ck/library/utility/host_tensor_generator.hpp"
|
||||
#include "ck/library/utility/literals.hpp"
|
||||
#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
|
||||
|
||||
namespace ck {
|
||||
@@ -105,15 +106,15 @@ bool profile_batched_gemm_gemm_impl(bool do_verification,
|
||||
std::size_t stride,
|
||||
std::size_t batch_stride,
|
||||
auto layout) {
|
||||
using namespace ck::literals;
|
||||
|
||||
if(std::is_same<decltype(layout), Row>::value)
|
||||
{
|
||||
return HostTensorDescriptor(std::vector<std::size_t>({batch_count, row, col}),
|
||||
std::vector<std::size_t>({batch_stride, stride, 1}));
|
||||
return HostTensorDescriptor({batch_count, row, col}, {batch_stride, stride, 1_uz});
|
||||
}
|
||||
else
|
||||
{
|
||||
return HostTensorDescriptor(std::vector<std::size_t>({batch_count, row, col}),
|
||||
std::vector<std::size_t>({batch_stride, 1, stride}));
|
||||
return HostTensorDescriptor({batch_count, row, col}, {batch_stride, 1_uz, stride});
|
||||
}
|
||||
};
|
||||
|
||||
@@ -283,8 +284,7 @@ bool profile_batched_gemm_gemm_impl(bool do_verification,
|
||||
{
|
||||
c_g_m_o_device_buf.FromDevice(c_g_m_o_device_result.mData.data());
|
||||
|
||||
pass = pass &
|
||||
ck::utils::check_err(c_g_m_o_device_result.mData, c_g_m_o_host_result.mData);
|
||||
pass = pass & ck::utils::check_err(c_g_m_o_device_result, c_g_m_o_host_result);
|
||||
|
||||
if(do_log)
|
||||
{
|
||||
|
||||
@@ -16,6 +16,7 @@
|
||||
#include "ck/library/utility/device_memory.hpp"
|
||||
#include "ck/library/utility/host_tensor.hpp"
|
||||
#include "ck/library/utility/host_tensor_generator.hpp"
|
||||
#include "ck/library/utility/literals.hpp"
|
||||
#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
|
||||
|
||||
namespace ck {
|
||||
@@ -50,15 +51,15 @@ bool profile_batched_gemm_impl(int do_verification,
|
||||
std::size_t stride,
|
||||
std::size_t batch_stride,
|
||||
auto layout) {
|
||||
using namespace ck::literals;
|
||||
|
||||
if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
|
||||
{
|
||||
return HostTensorDescriptor(std::vector<std::size_t>({batch_count, row, col}),
|
||||
std::vector<std::size_t>({batch_stride, stride, 1}));
|
||||
return HostTensorDescriptor({batch_count, row, col}, {batch_stride, stride, 1_uz});
|
||||
}
|
||||
else
|
||||
{
|
||||
return HostTensorDescriptor(std::vector<std::size_t>({batch_count, row, col}),
|
||||
std::vector<std::size_t>({batch_stride, 1, stride}));
|
||||
return HostTensorDescriptor({batch_count, row, col}, {batch_stride, 1_uz, stride});
|
||||
}
|
||||
};
|
||||
|
||||
@@ -202,8 +203,7 @@ bool profile_batched_gemm_impl(int do_verification,
|
||||
{
|
||||
c_device_buf.FromDevice(c_g_m_n_device_result.mData.data());
|
||||
|
||||
pass = pass &
|
||||
ck::utils::check_err(c_g_m_n_device_result.mData, c_g_m_n_host_result.mData);
|
||||
pass = pass & ck::utils::check_err(c_g_m_n_device_result, c_g_m_n_host_result);
|
||||
|
||||
if(do_log)
|
||||
{
|
||||
|
||||
@@ -14,6 +14,7 @@
|
||||
#include "ck/library/utility/device_memory.hpp"
|
||||
#include "ck/library/utility/host_tensor.hpp"
|
||||
#include "ck/library/utility/host_tensor_generator.hpp"
|
||||
#include "ck/library/utility/literals.hpp"
|
||||
#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
|
||||
|
||||
namespace ck {
|
||||
@@ -78,15 +79,15 @@ bool profile_batched_gemm_reduce_impl(int do_verification,
|
||||
std::size_t col,
|
||||
std::size_t stride,
|
||||
auto layout) {
|
||||
using namespace ck::literals;
|
||||
|
||||
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
|
||||
{
|
||||
return HostTensorDescriptor(std::vector<std::size_t>({batch_count, row, col}),
|
||||
std::vector<std::size_t>({row * stride, stride, 1}));
|
||||
return HostTensorDescriptor({batch_count, row, col}, {row * stride, stride, 1_uz});
|
||||
}
|
||||
else
|
||||
{
|
||||
return HostTensorDescriptor(std::vector<std::size_t>({batch_count, row, col}),
|
||||
std::vector<std::size_t>({col * stride, 1, stride}));
|
||||
return HostTensorDescriptor({batch_count, row, col}, {col * stride, 1_uz, stride});
|
||||
}
|
||||
};
|
||||
|
||||
@@ -95,17 +96,13 @@ bool profile_batched_gemm_reduce_impl(int do_verification,
|
||||
|
||||
Tensor<CDataType> c_g_m_n_host_result(
|
||||
f_host_tensor_descriptor(BatchCount, M, N, StrideC, CLayout{}));
|
||||
Tensor<ReduceDataType> d0_g_m_host_result(HostTensorDescriptor(std::vector<std::size_t>(
|
||||
{static_cast<std::size_t>(BatchCount), static_cast<std::size_t>(M)})));
|
||||
Tensor<ReduceDataType> d1_g_m_host_result(HostTensorDescriptor(std::vector<std::size_t>(
|
||||
{static_cast<std::size_t>(BatchCount), static_cast<std::size_t>(M)})));
|
||||
Tensor<ReduceDataType> d0_g_m_host_result({BatchCount, M});
|
||||
Tensor<ReduceDataType> d1_g_m_host_result({BatchCount, M});
|
||||
|
||||
Tensor<CDataType> c_g_m_n_device_result(
|
||||
f_host_tensor_descriptor(BatchCount, M, N, StrideC, CLayout{}));
|
||||
Tensor<ReduceDataType> d0_g_m_device_result(HostTensorDescriptor(std::vector<std::size_t>(
|
||||
{static_cast<std::size_t>(BatchCount), static_cast<std::size_t>(M)})));
|
||||
Tensor<ReduceDataType> d1_g_m_device_result(HostTensorDescriptor(std::vector<std::size_t>(
|
||||
{static_cast<std::size_t>(BatchCount), static_cast<std::size_t>(M)})));
|
||||
Tensor<ReduceDataType> d0_g_m_device_result({BatchCount, M});
|
||||
Tensor<ReduceDataType> d1_g_m_device_result({BatchCount, M});
|
||||
|
||||
std::cout << "a_g_m_k: " << a_g_m_k.mDesc << std::endl;
|
||||
std::cout << "b_g_k_n: " << b_g_k_n.mDesc << std::endl;
|
||||
@@ -319,12 +316,9 @@ bool profile_batched_gemm_reduce_impl(int do_verification,
|
||||
reduce0_device_buf.FromDevice(d0_g_m_device_result.mData.data());
|
||||
reduce1_device_buf.FromDevice(d1_g_m_device_result.mData.data());
|
||||
|
||||
bool c_error =
|
||||
ck::utils::check_err(c_g_m_n_device_result.mData, c_g_m_n_host_result.mData);
|
||||
bool d0_error =
|
||||
ck::utils::check_err(d0_g_m_device_result.mData, d0_g_m_host_result.mData);
|
||||
bool d1_error =
|
||||
ck::utils::check_err(d1_g_m_device_result.mData, d1_g_m_host_result.mData);
|
||||
bool c_error = ck::utils::check_err(c_g_m_n_device_result, c_g_m_n_host_result);
|
||||
bool d0_error = ck::utils::check_err(d0_g_m_device_result, d0_g_m_host_result);
|
||||
bool d1_error = ck::utils::check_err(d1_g_m_device_result, d1_g_m_host_result);
|
||||
|
||||
pass = pass && (c_error == true);
|
||||
pass = pass && (d0_error == true);
|
||||
|
||||
@@ -16,6 +16,7 @@
|
||||
#include "ck/library/utility/device_memory.hpp"
|
||||
#include "ck/library/utility/host_tensor.hpp"
|
||||
#include "ck/library/utility/host_tensor_generator.hpp"
|
||||
#include "ck/library/utility/literals.hpp"
|
||||
#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
|
||||
#include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp"
|
||||
|
||||
@@ -113,15 +114,15 @@ bool profile_batched_gemm_softmax_gemm_impl(bool do_verification,
|
||||
std::size_t stride,
|
||||
std::size_t batch_stride,
|
||||
auto layout) {
|
||||
using namespace ck::literals;
|
||||
|
||||
if(std::is_same<decltype(layout), Row>::value)
|
||||
{
|
||||
return HostTensorDescriptor(std::vector<std::size_t>({batch_count, row, col}),
|
||||
std::vector<std::size_t>({batch_stride, stride, 1}));
|
||||
return HostTensorDescriptor({batch_count, row, col}, {batch_stride, stride, 1_uz});
|
||||
}
|
||||
else
|
||||
{
|
||||
return HostTensorDescriptor(std::vector<std::size_t>({batch_count, row, col}),
|
||||
std::vector<std::size_t>({batch_stride, 1, stride}));
|
||||
return HostTensorDescriptor({batch_count, row, col}, {batch_stride, 1_uz, stride});
|
||||
}
|
||||
};
|
||||
|
||||
@@ -307,8 +308,7 @@ bool profile_batched_gemm_softmax_gemm_impl(bool do_verification,
|
||||
{
|
||||
c_g_m_o_device_buf.FromDevice(c_g_m_o_device_result.mData.data());
|
||||
|
||||
pass = pass &
|
||||
ck::utils::check_err(c_g_m_o_device_result.mData, c_g_m_o_host_result.mData);
|
||||
pass = pass & ck::utils::check_err(c_g_m_o_device_result, c_g_m_o_host_result);
|
||||
|
||||
if(do_log)
|
||||
{
|
||||
|
||||
@@ -16,6 +16,7 @@
|
||||
#include "ck/library/utility/device_memory.hpp"
|
||||
#include "ck/library/utility/host_tensor.hpp"
|
||||
#include "ck/library/utility/host_tensor_generator.hpp"
|
||||
#include "ck/library/utility/literals.hpp"
|
||||
#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
|
||||
#include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp"
|
||||
|
||||
@@ -308,8 +309,8 @@ bool profile_batched_gemm_softmax_gemm_permute_impl(bool do_verification,
|
||||
{
|
||||
c_device_buf.FromDevice(c_gs_ms_os_device_result.mData.data());
|
||||
|
||||
pass = pass & ck::utils::check_err(c_gs_ms_os_device_result.mData,
|
||||
c_gs_ms_os_host_result.mData);
|
||||
pass =
|
||||
pass & ck::utils::check_err(c_gs_ms_os_device_result, c_gs_ms_os_host_result);
|
||||
|
||||
if(do_log)
|
||||
{
|
||||
|
||||
@@ -209,8 +209,7 @@ bool profile_conv_bwd_data_impl(int do_verification,
|
||||
{
|
||||
in_device_buf.FromDevice(input_device_result.mData.data());
|
||||
|
||||
pass =
|
||||
pass & ck::utils::check_err(input_device_result.mData, input_host_result.mData);
|
||||
pass = pass & ck::utils::check_err(input_device_result, input_host_result);
|
||||
|
||||
if(do_log)
|
||||
{
|
||||
|
||||
@@ -12,6 +12,7 @@
|
||||
#include "ck/library/utility/device_memory.hpp"
|
||||
#include "ck/library/utility/host_tensor.hpp"
|
||||
#include "ck/library/utility/host_tensor_generator.hpp"
|
||||
#include "ck/library/utility/literals.hpp"
|
||||
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation_add.hpp"
|
||||
|
||||
namespace ck {
|
||||
@@ -68,19 +69,19 @@ void profile_conv_fwd_bias_relu_add_impl(int do_verification,
|
||||
|
||||
auto f_host_tensor_descriptor =
|
||||
[](std::size_t N_, std::size_t C_, std::size_t H, std::size_t W, auto layout) {
|
||||
using namespace ck::literals;
|
||||
|
||||
if constexpr(is_same<decltype(layout), ck::tensor_layout::convolution::NCHW>::value ||
|
||||
is_same<decltype(layout), ck::tensor_layout::convolution::KCYX>::value ||
|
||||
is_same<decltype(layout), ck::tensor_layout::convolution::NKHW>::value)
|
||||
{
|
||||
return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
|
||||
std::vector<std::size_t>({C_ * H * W, H * W, W, 1}));
|
||||
return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, H * W, W, 1_uz});
|
||||
}
|
||||
else if constexpr(is_same<decltype(layout), tensor_layout::convolution::NHWC>::value ||
|
||||
is_same<decltype(layout), tensor_layout::convolution::KYXC>::value ||
|
||||
is_same<decltype(layout), tensor_layout::convolution::NHWK>::value)
|
||||
{
|
||||
return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
|
||||
std::vector<std::size_t>({C_ * H * W, 1, W * C_, C_}));
|
||||
return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, 1_uz, W * C_, C_});
|
||||
}
|
||||
};
|
||||
|
||||
@@ -92,8 +93,7 @@ void profile_conv_fwd_bias_relu_add_impl(int do_verification,
|
||||
f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{}));
|
||||
|
||||
// bias: assume contiguous 1d vector
|
||||
Tensor<OutDataType> bias_k(
|
||||
HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(K)})));
|
||||
Tensor<OutDataType> bias_k({K});
|
||||
|
||||
// residual: assume same layout as output tensor
|
||||
Tensor<OutDataType> resi_n_k_ho_wo(f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{}));
|
||||
@@ -251,8 +251,7 @@ void profile_conv_fwd_bias_relu_add_impl(int do_verification,
|
||||
{
|
||||
out_device_buf.FromDevice(out_n_k_ho_wo_device_result.mData.data());
|
||||
|
||||
ck::utils::check_err(out_n_k_ho_wo_device_result.mData,
|
||||
out_n_k_ho_wo_host_result.mData);
|
||||
ck::utils::check_err(out_n_k_ho_wo_device_result, out_n_k_ho_wo_host_result);
|
||||
|
||||
if(do_log)
|
||||
{
|
||||
|
||||
@@ -12,6 +12,7 @@
|
||||
#include "ck/library/utility/device_memory.hpp"
|
||||
#include "ck/library/utility/host_tensor.hpp"
|
||||
#include "ck/library/utility/host_tensor_generator.hpp"
|
||||
#include "ck/library/utility/literals.hpp"
|
||||
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation.hpp"
|
||||
|
||||
namespace ck {
|
||||
@@ -68,19 +69,19 @@ void profile_conv_fwd_bias_relu_impl(int do_verification,
|
||||
|
||||
auto f_host_tensor_descriptor =
|
||||
[](std::size_t N_, std::size_t C_, std::size_t H, std::size_t W, auto layout) {
|
||||
using namespace ck::literals;
|
||||
|
||||
if constexpr(is_same<decltype(layout), ck::tensor_layout::convolution::NCHW>::value ||
|
||||
is_same<decltype(layout), ck::tensor_layout::convolution::KCYX>::value ||
|
||||
is_same<decltype(layout), ck::tensor_layout::convolution::NKHW>::value)
|
||||
{
|
||||
return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
|
||||
std::vector<std::size_t>({C_ * H * W, H * W, W, 1}));
|
||||
return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, H * W, W, 1_uz});
|
||||
}
|
||||
else if constexpr(is_same<decltype(layout), tensor_layout::convolution::NHWC>::value ||
|
||||
is_same<decltype(layout), tensor_layout::convolution::KYXC>::value ||
|
||||
is_same<decltype(layout), tensor_layout::convolution::NHWK>::value)
|
||||
{
|
||||
return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
|
||||
std::vector<std::size_t>({C_ * H * W, 1, W * C_, C_}));
|
||||
return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, 1_uz, W * C_, C_});
|
||||
}
|
||||
};
|
||||
|
||||
@@ -92,8 +93,7 @@ void profile_conv_fwd_bias_relu_impl(int do_verification,
|
||||
f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{}));
|
||||
|
||||
// bias: assume contiguous 1d vector
|
||||
Tensor<OutDataType> bias_k(
|
||||
HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(K)})));
|
||||
Tensor<OutDataType> bias_k({K});
|
||||
|
||||
std::cout << "in_n_c_hi_wi: " << in_n_c_hi_wi.mDesc << std::endl;
|
||||
std::cout << "wei_k_c_y_x: " << wei_k_c_y_x.mDesc << std::endl;
|
||||
@@ -239,8 +239,7 @@ void profile_conv_fwd_bias_relu_impl(int do_verification,
|
||||
{
|
||||
out_device_buf.FromDevice(out_n_k_ho_wo_device_result.mData.data());
|
||||
|
||||
ck::utils::check_err(out_n_k_ho_wo_device_result.mData,
|
||||
out_n_k_ho_wo_host_result.mData);
|
||||
ck::utils::check_err(out_n_k_ho_wo_device_result, out_n_k_ho_wo_host_result);
|
||||
|
||||
if(do_log)
|
||||
{
|
||||
|
||||
@@ -191,7 +191,7 @@ bool profile_conv_fwd_impl(int do_verification,
|
||||
{
|
||||
out_device_buf.FromDevice(device_output.mData.data());
|
||||
|
||||
pass = pass & ck::utils::check_err(device_output.mData, host_output.mData);
|
||||
pass = pass & ck::utils::check_err(device_output, host_output);
|
||||
|
||||
if(do_log)
|
||||
{
|
||||
|
||||
@@ -453,7 +453,7 @@ bool profile_convnd_bwd_data_impl(int do_verification,
|
||||
std::cout << "Pass Info: " << conv_ptr->GetTypeString() << std::endl;
|
||||
}
|
||||
|
||||
success = ck::utils::check_err(input_host_result.mData, input_device_result.mData);
|
||||
success = ck::utils::check_err(input_host_result, input_device_result);
|
||||
|
||||
if(do_log)
|
||||
{
|
||||
|
||||
@@ -433,7 +433,7 @@ bool profile_convnd_bwd_weight_impl(int do_verification,
|
||||
{
|
||||
wei_device_buf.FromDevice(weights_device_result.mData.data());
|
||||
|
||||
success = ck::utils::check_err(weights_host_result.mData, weights_device_result.mData);
|
||||
success = ck::utils::check_err(weights_host_result, weights_device_result);
|
||||
|
||||
if(success == false)
|
||||
{
|
||||
|
||||
@@ -13,6 +13,7 @@
|
||||
#include "ck/library/utility/device_memory.hpp"
|
||||
#include "ck/library/utility/host_tensor.hpp"
|
||||
#include "ck/library/utility/host_tensor_generator.hpp"
|
||||
#include "ck/library/utility/literals.hpp"
|
||||
#include "ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp"
|
||||
|
||||
namespace ck {
|
||||
@@ -68,8 +69,9 @@ bool profile_elementwise_layernorm_impl(int do_verification,
|
||||
std::vector<index_t> gammaBetaStride = {0, 1};
|
||||
|
||||
auto f_host_tensor_descriptor2d = [](std::size_t row, std::size_t col, std::size_t stride) {
|
||||
return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
|
||||
std::vector<std::size_t>({stride, 1}));
|
||||
using namespace ck::literals;
|
||||
|
||||
return HostTensorDescriptor({row, col}, {stride, 1_uz});
|
||||
};
|
||||
|
||||
Tensor<ADataType> a(length);
|
||||
|
||||
@@ -16,6 +16,7 @@
|
||||
#include "ck/library/utility/device_memory.hpp"
|
||||
#include "ck/library/utility/host_tensor.hpp"
|
||||
#include "ck/library/utility/host_tensor_generator.hpp"
|
||||
#include "ck/library/utility/literals.hpp"
|
||||
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
|
||||
|
||||
namespace ck {
|
||||
@@ -47,15 +48,15 @@ bool profile_gemm_add_add_fastgelu_impl(int do_verification,
|
||||
{
|
||||
auto f_host_tensor_descriptor =
|
||||
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
|
||||
using namespace ck::literals;
|
||||
|
||||
if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
|
||||
{
|
||||
return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
|
||||
std::vector<std::size_t>({stride, 1}));
|
||||
return HostTensorDescriptor({row, col}, {stride, 1_uz});
|
||||
}
|
||||
else
|
||||
{
|
||||
return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
|
||||
std::vector<std::size_t>({1, stride}));
|
||||
return HostTensorDescriptor({row, col}, {1_uz, stride});
|
||||
}
|
||||
};
|
||||
|
||||
@@ -121,8 +122,7 @@ bool profile_gemm_add_add_fastgelu_impl(int do_verification,
|
||||
// run reference
|
||||
if(do_verification)
|
||||
{
|
||||
Tensor<AccDataType> c_m_n(HostTensorDescriptor(
|
||||
std::vector<std::size_t>{static_cast<std::size_t>(M), static_cast<std::size_t>(N)}));
|
||||
Tensor<AccDataType> c_m_n({M, N});
|
||||
|
||||
using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
|
||||
BDataType,
|
||||
@@ -223,8 +223,7 @@ bool profile_gemm_add_add_fastgelu_impl(int do_verification,
|
||||
{
|
||||
e_device_buf.FromDevice(e_m_n_device_result.mData.data());
|
||||
|
||||
pass = pass &&
|
||||
ck::utils::check_err(e_m_n_device_result.mData, e_m_n_host_result.mData);
|
||||
pass = pass && ck::utils::check_err(e_m_n_device_result, e_m_n_host_result);
|
||||
}
|
||||
}
|
||||
else
|
||||
|
||||
@@ -14,6 +14,7 @@
|
||||
#include "ck/library/utility/device_memory.hpp"
|
||||
#include "ck/library/utility/host_tensor.hpp"
|
||||
#include "ck/library/utility/host_tensor_generator.hpp"
|
||||
#include "ck/library/utility/literals.hpp"
|
||||
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
|
||||
|
||||
namespace ck {
|
||||
@@ -75,21 +76,20 @@ void profile_gemm_bias_add_reduce_impl(int do_verification,
|
||||
int StrideD0)
|
||||
{
|
||||
auto f_host_tensor_descriptor1d = [](std::size_t len, std::size_t stride) {
|
||||
return HostTensorDescriptor(std::vector<std::size_t>({len}),
|
||||
std::vector<std::size_t>({stride}));
|
||||
return HostTensorDescriptor({len}, {stride});
|
||||
};
|
||||
|
||||
auto f_host_tensor_descriptor2d =
|
||||
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
|
||||
using namespace ck::literals;
|
||||
|
||||
if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
|
||||
{
|
||||
return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
|
||||
std::vector<std::size_t>({stride, 1}));
|
||||
return HostTensorDescriptor({row, col}, {stride, 1_uz});
|
||||
}
|
||||
else
|
||||
{
|
||||
return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
|
||||
std::vector<std::size_t>({1, stride}));
|
||||
return HostTensorDescriptor({row, col}, {1_uz, stride});
|
||||
}
|
||||
};
|
||||
|
||||
@@ -99,16 +99,12 @@ void profile_gemm_bias_add_reduce_impl(int do_verification,
|
||||
Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor2d(M, N, StrideC, CLayout{}));
|
||||
Tensor<BiasDataType> bias_n(f_host_tensor_descriptor1d(N, 1));
|
||||
Tensor<D0DataType> d0_m_n(f_host_tensor_descriptor2d(M, N, StrideC, CLayout{}));
|
||||
Tensor<ReduceDataType> reduce0_m_host_result(
|
||||
HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
|
||||
Tensor<ReduceDataType> reduce1_m_host_result(
|
||||
HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
|
||||
Tensor<ReduceDataType> reduce0_m_host_result({M});
|
||||
Tensor<ReduceDataType> reduce1_m_host_result({M});
|
||||
|
||||
Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor2d(M, N, StrideC, CLayout{}));
|
||||
Tensor<ReduceDataType> reduce0_m_device_result(
|
||||
HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
|
||||
Tensor<ReduceDataType> reduce1_m_device_result(
|
||||
HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
|
||||
Tensor<ReduceDataType> reduce0_m_device_result({M});
|
||||
Tensor<ReduceDataType> reduce1_m_device_result({M});
|
||||
|
||||
std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
|
||||
std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
|
||||
@@ -347,9 +343,9 @@ void profile_gemm_bias_add_reduce_impl(int do_verification,
|
||||
reduce0_device_buf.FromDevice(reduce0_m_device_result.mData.data());
|
||||
reduce1_device_buf.FromDevice(reduce1_m_device_result.mData.data());
|
||||
|
||||
ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData);
|
||||
ck::utils::check_err(reduce0_m_device_result.mData, reduce0_m_host_result.mData);
|
||||
ck::utils::check_err(reduce1_m_device_result.mData, reduce1_m_host_result.mData);
|
||||
ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);
|
||||
ck::utils::check_err(reduce0_m_device_result, reduce0_m_host_result);
|
||||
ck::utils::check_err(reduce1_m_device_result, reduce1_m_host_result);
|
||||
|
||||
if(do_log)
|
||||
{
|
||||
|
||||
@@ -16,6 +16,7 @@
|
||||
#include "ck/library/utility/device_memory.hpp"
|
||||
#include "ck/library/utility/host_tensor.hpp"
|
||||
#include "ck/library/utility/host_tensor_generator.hpp"
|
||||
#include "ck/library/utility/literals.hpp"
|
||||
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
|
||||
|
||||
namespace ck {
|
||||
@@ -46,15 +47,15 @@ bool profile_gemm_bilinear_impl(int do_verification,
|
||||
{
|
||||
auto f_host_tensor_descriptor =
|
||||
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
|
||||
using namespace ck::literals;
|
||||
|
||||
if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
|
||||
{
|
||||
return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
|
||||
std::vector<std::size_t>({stride, 1}));
|
||||
return HostTensorDescriptor({row, col}, {stride, 1_uz});
|
||||
}
|
||||
else
|
||||
{
|
||||
return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
|
||||
std::vector<std::size_t>({1, stride}));
|
||||
return HostTensorDescriptor({row, col}, {1_uz, stride});
|
||||
}
|
||||
};
|
||||
|
||||
@@ -116,8 +117,7 @@ bool profile_gemm_bilinear_impl(int do_verification,
|
||||
// run reference
|
||||
if(do_verification)
|
||||
{
|
||||
Tensor<AccDataType> c_m_n(HostTensorDescriptor(
|
||||
std::vector<std::size_t>{static_cast<std::size_t>(M), static_cast<std::size_t>(N)}));
|
||||
Tensor<AccDataType> c_m_n({M, N});
|
||||
|
||||
using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
|
||||
BDataType,
|
||||
@@ -215,8 +215,7 @@ bool profile_gemm_bilinear_impl(int do_verification,
|
||||
{
|
||||
e_device_buf.FromDevice(e_m_n_device_result.mData.data());
|
||||
|
||||
pass = pass &&
|
||||
ck::utils::check_err(e_m_n_device_result.mData, e_m_n_host_result.mData);
|
||||
pass = pass && ck::utils::check_err(e_m_n_device_result, e_m_n_host_result);
|
||||
}
|
||||
}
|
||||
else
|
||||
|
||||
@@ -18,6 +18,7 @@
|
||||
#include "ck/library/utility/device_memory.hpp"
|
||||
#include "ck/library/utility/host_tensor.hpp"
|
||||
#include "ck/library/utility/host_tensor_generator.hpp"
|
||||
#include "ck/library/utility/literals.hpp"
|
||||
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
|
||||
|
||||
namespace ck {
|
||||
@@ -45,15 +46,15 @@ int profile_gemm_impl(int do_verification,
|
||||
|
||||
auto f_host_tensor_descriptor =
|
||||
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
|
||||
using namespace ck::literals;
|
||||
|
||||
if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
|
||||
{
|
||||
return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
|
||||
std::vector<std::size_t>({stride, 1}));
|
||||
return HostTensorDescriptor({row, col}, {stride, 1_uz});
|
||||
}
|
||||
else
|
||||
{
|
||||
return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
|
||||
std::vector<std::size_t>({1, stride}));
|
||||
return HostTensorDescriptor({row, col}, {1_uz, stride});
|
||||
}
|
||||
};
|
||||
|
||||
@@ -187,8 +188,7 @@ int profile_gemm_impl(int do_verification,
|
||||
{
|
||||
c_device_buf.FromDevice(c_m_n_device_result.mData.data());
|
||||
|
||||
pass =
|
||||
pass & ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData);
|
||||
pass = pass & ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);
|
||||
|
||||
if(do_log)
|
||||
{
|
||||
|
||||
@@ -14,6 +14,7 @@
|
||||
#include "ck/library/utility/device_memory.hpp"
|
||||
#include "ck/library/utility/host_tensor.hpp"
|
||||
#include "ck/library/utility/host_tensor_generator.hpp"
|
||||
#include "ck/library/utility/literals.hpp"
|
||||
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
|
||||
|
||||
namespace ck {
|
||||
@@ -75,15 +76,15 @@ bool profile_gemm_reduce_impl(int do_verification,
|
||||
|
||||
auto f_host_tensor_descriptor =
|
||||
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
|
||||
using namespace ck::literals;
|
||||
|
||||
if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
|
||||
{
|
||||
return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
|
||||
std::vector<std::size_t>({stride, 1}));
|
||||
return HostTensorDescriptor({row, col}, {stride, 1_uz});
|
||||
}
|
||||
else
|
||||
{
|
||||
return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
|
||||
std::vector<std::size_t>({1, stride}));
|
||||
return HostTensorDescriptor({row, col}, {1_uz, stride});
|
||||
}
|
||||
};
|
||||
|
||||
@@ -91,16 +92,12 @@ bool profile_gemm_reduce_impl(int do_verification,
|
||||
Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
|
||||
|
||||
Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
|
||||
Tensor<ReduceDataType> reduce0_m_host_result(
|
||||
HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
|
||||
Tensor<ReduceDataType> reduce1_m_host_result(
|
||||
HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
|
||||
Tensor<ReduceDataType> reduce0_m_host_result({M});
|
||||
Tensor<ReduceDataType> reduce1_m_host_result({M});
|
||||
|
||||
Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
|
||||
Tensor<ReduceDataType> reduce0_m_device_result(
|
||||
HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
|
||||
Tensor<ReduceDataType> reduce1_m_device_result(
|
||||
HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
|
||||
Tensor<ReduceDataType> reduce0_m_device_result({M});
|
||||
Tensor<ReduceDataType> reduce1_m_device_result({M});
|
||||
|
||||
std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
|
||||
std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
|
||||
@@ -313,9 +310,9 @@ bool profile_gemm_reduce_impl(int do_verification,
|
||||
reduce0_device_buf.FromDevice(reduce0_m_device_result.mData.data());
|
||||
reduce1_device_buf.FromDevice(reduce1_m_device_result.mData.data());
|
||||
|
||||
ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData);
|
||||
ck::utils::check_err(reduce0_m_device_result.mData, reduce0_m_host_result.mData);
|
||||
ck::utils::check_err(reduce1_m_device_result.mData, reduce1_m_host_result.mData);
|
||||
ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);
|
||||
ck::utils::check_err(reduce0_m_device_result, reduce0_m_host_result);
|
||||
ck::utils::check_err(reduce1_m_device_result, reduce1_m_host_result);
|
||||
|
||||
if(do_log)
|
||||
{
|
||||
|
||||
@@ -18,6 +18,7 @@
|
||||
#include "ck/library/utility/device_memory.hpp"
|
||||
#include "ck/library/utility/host_tensor.hpp"
|
||||
#include "ck/library/utility/host_tensor_generator.hpp"
|
||||
#include "ck/library/utility/literals.hpp"
|
||||
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
|
||||
|
||||
namespace ck {
|
||||
@@ -46,15 +47,15 @@ bool profile_gemm_splitk_impl(int do_verification,
|
||||
|
||||
auto f_host_tensor_descriptor =
|
||||
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
|
||||
using namespace ck::literals;
|
||||
|
||||
if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
|
||||
{
|
||||
return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
|
||||
std::vector<std::size_t>({stride, 1}));
|
||||
return HostTensorDescriptor({row, col}, {stride, 1_uz});
|
||||
}
|
||||
else
|
||||
{
|
||||
return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
|
||||
std::vector<std::size_t>({1, stride}));
|
||||
return HostTensorDescriptor({row, col}, {1_uz, stride});
|
||||
}
|
||||
};
|
||||
|
||||
@@ -190,8 +191,7 @@ bool profile_gemm_splitk_impl(int do_verification,
|
||||
{
|
||||
c_device_buf.FromDevice(c_m_n_device_result.mData.data());
|
||||
|
||||
pass =
|
||||
pass & ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData);
|
||||
pass = pass & ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);
|
||||
|
||||
if(do_log)
|
||||
{
|
||||
|
||||
@@ -209,8 +209,7 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
|
||||
{
|
||||
wei_device_buf.FromDevice(weight_device_result.mData.data());
|
||||
|
||||
bool pass =
|
||||
ck::utils::check_err(weight_device_result.mData, weight_host_result.mData);
|
||||
bool pass = ck::utils::check_err(weight_device_result, weight_host_result);
|
||||
|
||||
if(!pass)
|
||||
{
|
||||
|
||||
@@ -14,6 +14,7 @@
|
||||
#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_dl.hpp"
|
||||
|
||||
#include "ck/library/utility/algorithm.hpp"
|
||||
#include "ck/library/utility/check_err.hpp"
|
||||
#include "ck/library/utility/device_memory.hpp"
|
||||
#include "ck/library/utility/host_tensor.hpp"
|
||||
@@ -66,7 +67,7 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
|
||||
std::array<ck::index_t, NDimSpatial> input_left_pads{};
|
||||
std::array<ck::index_t, NDimSpatial> input_right_pads{};
|
||||
|
||||
auto copy = [](auto& x, auto& y) { std::copy(x.begin(), x.end(), y.begin()); };
|
||||
auto copy = [](const auto& x, auto& y) { ck::ranges::copy(x, y.begin()); };
|
||||
|
||||
copy(in_g_n_c_wis_desc.GetLengths(), a_g_n_c_wis_lengths);
|
||||
copy(in_g_n_c_wis_desc.GetStrides(), a_g_n_c_wis_strides);
|
||||
@@ -179,7 +180,7 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
|
||||
{
|
||||
out_device_buf.FromDevice(device_output.mData.data());
|
||||
|
||||
pass = pass & ck::utils::check_err(device_output.mData, host_output.mData);
|
||||
pass = pass & ck::utils::check_err(device_output, host_output);
|
||||
|
||||
if(do_log)
|
||||
{
|
||||
|
||||
@@ -17,6 +17,7 @@
|
||||
#include "ck/library/utility/device_memory.hpp"
|
||||
#include "ck/library/utility/host_tensor.hpp"
|
||||
#include "ck/library/utility/host_tensor_generator.hpp"
|
||||
#include "ck/library/utility/literals.hpp"
|
||||
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
|
||||
|
||||
namespace ck {
|
||||
@@ -45,15 +46,15 @@ bool profile_grouped_gemm_impl(int do_verification,
|
||||
|
||||
auto f_host_tensor_descriptor =
|
||||
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
|
||||
using namespace ck::literals;
|
||||
|
||||
if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
|
||||
{
|
||||
return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
|
||||
std::vector<std::size_t>({stride, 1}));
|
||||
return HostTensorDescriptor({row, col}, {stride, 1_uz});
|
||||
}
|
||||
else
|
||||
{
|
||||
return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
|
||||
std::vector<std::size_t>({1, stride}));
|
||||
return HostTensorDescriptor({row, col}, {1_uz, stride});
|
||||
}
|
||||
};
|
||||
|
||||
@@ -257,8 +258,7 @@ bool profile_grouped_gemm_impl(int do_verification,
|
||||
c_element_op);
|
||||
|
||||
ref_invoker.Run(ref_argument);
|
||||
pass = pass && ck::utils::check_err(c_m_n_device_results[i].mData,
|
||||
c_m_n_host_result.mData);
|
||||
pass = pass && ck::utils::check_err(c_m_n_device_results[i], c_m_n_host_result);
|
||||
|
||||
if(do_log)
|
||||
{
|
||||
|
||||
@@ -165,8 +165,7 @@ bool profile_groupnorm_impl(int do_verification,
|
||||
{
|
||||
y_dev.FromDevice(y.mData.data());
|
||||
|
||||
bool pass =
|
||||
ck::utils::check_err(y.mData, host_y.mData, "Error: Incorrect results", 1e-3, 1e-3);
|
||||
bool pass = ck::utils::check_err(y, host_y, "Error: Incorrect results", 1e-3, 1e-3);
|
||||
|
||||
if(do_log)
|
||||
{
|
||||
|
||||
@@ -411,13 +411,12 @@ bool profile_reduce_impl_impl(bool do_verification,
|
||||
bool single_pass;
|
||||
|
||||
out_dev.FromDevice(out.mData.data());
|
||||
single_pass = ck::utils::check_err(out.mData, out_ref.mData);
|
||||
single_pass = ck::utils::check_err(out, out_ref);
|
||||
|
||||
if(OutputIndex)
|
||||
{
|
||||
out_indices_dev.FromDevice(out_indices.mData.data());
|
||||
single_pass = single_pass &&
|
||||
ck::utils::check_err(out_indices.mData, out_indices_ref.mData);
|
||||
single_pass = single_pass && ck::utils::check_err(out_indices, out_indices_ref);
|
||||
};
|
||||
|
||||
if(!single_pass)
|
||||
|
||||
Reference in New Issue
Block a user