[CK][Examples] Fixing stride issues in ck examples by workaround - Bypassing hostTensor validation.

Signed-off-by: Michal Kulikowski <Michal.Kulikowski@amd.com>


[ROCm/composable_kernel commit: b9789a0742]
This commit is contained in:
Michal Kulikowski
2025-10-16 13:01:24 +02:00
committed by Michał Kulikowski
parent 2a01918313
commit 31939e7b2b
33 changed files with 195 additions and 199 deletions

View File

@@ -43,8 +43,9 @@ using S = ck::Sequence<Is...>;
using F16 = ck::half_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using Bypass = ck::tensor_layout::BypassLayoutVerification;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
@@ -190,11 +191,11 @@ int main(int argc, char* argv[])
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
{
return HostTensorDescriptor({row, col}, {stride, 1_uz});
return HostTensorDescriptor({row, col}, {stride, 1_uz}, Bypass{});
}
else
{
return HostTensorDescriptor({row, col}, {1_uz, stride});
return HostTensorDescriptor({row, col}, {1_uz, stride}, Bypass{});
}
};

View File

@@ -43,8 +43,9 @@ using S = ck::Sequence<Is...>;
using I8 = std::int8_t;
using I32 = std::int32_t;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using Bypass = ck::tensor_layout::BypassLayoutVerification;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
@@ -190,11 +191,11 @@ int main(int argc, char* argv[])
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
{
return HostTensorDescriptor({row, col}, {stride, 1_uz});
return HostTensorDescriptor({row, col}, {stride, 1_uz}, Bypass{});
}
else
{
return HostTensorDescriptor({row, col}, {1_uz, stride});
return HostTensorDescriptor({row, col}, {1_uz, stride}, Bypass{});
}
};

View File

@@ -42,8 +42,9 @@ using S = ck::Sequence<Is...>;
using F16 = ck::half_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using Bypass = ck::tensor_layout::BypassLayoutVerification;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
@@ -173,7 +174,7 @@ int main(int argc, char* argv[])
printf("arg3: time kernel (0=no, 1=yes)\n");
printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideD, StrideE, alpha, "
"beta\n");
exit(0);
exit(1);
}
auto f_host_tensor_descriptor =
@@ -182,11 +183,11 @@ int main(int argc, char* argv[])
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
{
return HostTensorDescriptor({row, col}, {stride, 1_uz});
return HostTensorDescriptor({row, col}, {stride, 1_uz}, Bypass{});
}
else
{
return HostTensorDescriptor({row, col}, {1_uz, stride});
return HostTensorDescriptor({row, col}, {1_uz, stride}, Bypass{});
}
};

View File

@@ -25,8 +25,9 @@ using S = ck::Sequence<Is...>;
using F16 = ck::half_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using Bypass = ck::tensor_layout::BypassLayoutVerification;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
@@ -160,23 +161,22 @@ int main(int argc, char* argv[])
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
{
return HostTensorDescriptor({row, col}, {stride, 1_uz});
return HostTensorDescriptor({row, col}, {stride, 1_uz}, Bypass{});
}
else
{
return HostTensorDescriptor({row, col}, {1_uz, stride});
return HostTensorDescriptor({row, col}, {1_uz, stride}, Bypass{});
}
};
ck::index_t StrideD = 0;
Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
Tensor<DDataType> d_m_n(f_host_tensor_descriptor(M, N, 0, ELayout{}));
Tensor<DDataType> d_m_n(f_host_tensor_descriptor(M, N, StrideD, ELayout{}));
Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
const auto StrideD = std::is_same<decltype(ELayout{}), ck::tensor_layout::gemm::RowMajor>::value
? d_m_n.mDesc.GetStrides()[0]
: d_m_n.mDesc.GetStrides()[1];
std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
std::cout << "d_m_n: " << d_m_n.mDesc << std::endl;

View File

@@ -6,6 +6,7 @@ bool run_gemm_add_add_fastgelu(const ProblemSize& problem_size, const ExecutionC
static_assert(sizeof(ck::int4_t) == sizeof(int8_t));
#endif
using namespace ck::literals;
using Bypass = ck::tensor_layout::BypassLayoutVerification;
ProblemSize ps =
problem_size; // make mutable copy because default stride values of 0 need to be updated
@@ -15,11 +16,11 @@ bool run_gemm_add_add_fastgelu(const ProblemSize& problem_size, const ExecutionC
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
{
return HostTensorDescriptor({row, col}, {stride, 1_uz});
return HostTensorDescriptor({row, col}, {stride, 1_uz}, Bypass{});
}
else
{
return HostTensorDescriptor({row, col}, {1_uz, stride});
return HostTensorDescriptor({row, col}, {1_uz, stride}, Bypass{});
}
};
@@ -43,7 +44,7 @@ bool run_gemm_add_add_fastgelu(const ProblemSize& problem_size, const ExecutionC
std::cout << "d1_m_n: " << d1_m_n.mDesc << std::endl;
std::cout << "e_m_n: " << e_m_n_host_result.mDesc << std::endl;
// If any user-provided leading stride <= 0, replace it with the one determined by the
// If any user-provided leading stride < 0, replace it with the one determined by the
// created tensor descriptor. For RowMajor the leading stride is index 0, for ColMajor index 1.
auto fetch_leading_stride = [](const auto& tensor, auto layout_tag) -> int {
if constexpr(std::is_same_v<decltype(layout_tag), ck::tensor_layout::gemm::RowMajor>)
@@ -56,15 +57,15 @@ bool run_gemm_add_add_fastgelu(const ProblemSize& problem_size, const ExecutionC
}
};
if(StrideA <= 0)
if(StrideA < 0)
StrideA = fetch_leading_stride(a_m_k, ALayout{});
if(StrideB <= 0)
if(StrideB < 0)
StrideB = fetch_leading_stride(b_k_n, BLayout{});
if(StrideD0 <= 0)
if(StrideD0 < 0)
StrideD0 = fetch_leading_stride(d0_m_n, D0Layout{});
if(StrideD1 <= 0)
if(StrideD1 < 0)
StrideD1 = fetch_leading_stride(d1_m_n, D1Layout{});
if(StrideE <= 0)
if(StrideE < 0)
StrideE = fetch_leading_stride(e_m_n_host_result, ELayout{});
switch(config.init_method)

View File

@@ -25,8 +25,9 @@ using S = ck::Sequence<Is...>;
using F16 = ck::half_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using Bypass = ck::tensor_layout::BypassLayoutVerification;
using ADataType = F16;
using BDataType = F16;
@@ -138,12 +139,12 @@ int main(int argc, char* argv[])
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
{
return HostTensorDescriptor(
{batch_count, row, col}, {row * stride, stride, 1_uz}, layout);
{batch_count, row, col}, {row * stride, stride, 1_uz}, Bypass{});
}
else
{
return HostTensorDescriptor(
{batch_count, row, col}, {col * stride, 1_uz, stride}, layout);
{batch_count, row, col}, {col * stride, 1_uz, stride}, Bypass{});
}
};

View File

@@ -31,6 +31,7 @@ struct ExecutionConfig final
bool run_batched_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
{
using namespace ck::literals;
using Bypass = ck::tensor_layout::BypassLayoutVerification;
#if defined(BUILD_INT4_EXAMPLE) && defined(CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4)
static_assert(sizeof(ck::int4_t) == sizeof(int8_t));
@@ -62,12 +63,12 @@ bool run_batched_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
{
return HostTensorDescriptor(
{batch_count_, row, col}, {batch_stride, stride, 1_uz}, layout);
{batch_count_, row, col}, {batch_stride, stride, 1_uz}, Bypass{});
}
else
{
return HostTensorDescriptor(
{batch_count_, row, col}, {batch_stride, 1_uz, stride}, layout);
{batch_count_, row, col}, {batch_stride, 1_uz, stride}, Bypass{});
}
};

View File

@@ -116,6 +116,7 @@ inline __host__ __device__ constexpr double get_atol()
bool run_batched_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
{
using namespace ck::literals;
using Bypass = ck::tensor_layout::BypassLayoutVerification;
auto& [M,
N,
@@ -138,12 +139,12 @@ bool run_batched_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
{
return HostTensorDescriptor(
{batch_count_, row, col}, {batch_stride, stride, 1_uz}, layout);
{batch_count_, row, col}, {batch_stride, stride, 1_uz}, Bypass{});
}
else
{
return HostTensorDescriptor(
{batch_count_, row, col}, {batch_stride, 1_uz, stride}, layout);
{batch_count_, row, col}, {batch_stride, 1_uz, stride}, Bypass{});
}
};

View File

@@ -37,6 +37,7 @@ struct ExecutionConfig final
bool run_batched_gemm_rowwise(const ProblemSize& problem_size, const ExecutionConfig& config)
{
using namespace ck::literals;
using Bypass = ck::tensor_layout::BypassLayoutVerification;
auto& [M,
N,
@@ -65,12 +66,12 @@ bool run_batched_gemm_rowwise(const ProblemSize& problem_size, const ExecutionCo
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
{
return HostTensorDescriptor(
{batch_count_, row, col}, {batch_stride, stride, 1_uz}, layout);
{batch_count_, row, col}, {batch_stride, stride, 1_uz}, Bypass{});
}
else
{
return HostTensorDescriptor(
{batch_count_, row, col}, {batch_stride, 1_uz, stride}, layout);
{batch_count_, row, col}, {batch_stride, 1_uz, stride}, Bypass{});
}
};

View File

@@ -1,8 +1,10 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
using Bypass = ck::tensor_layout::BypassLayoutVerification;
bool run_batched_gemm_gemm_example(int argc, char* argv[])
{
bool do_verification = true;
@@ -111,12 +113,12 @@ bool run_batched_gemm_gemm_example(int argc, char* argv[])
if(std::is_same<decltype(layout), Row>::value)
{
return HostTensorDescriptor(
{batch_count, row, col}, {batch_stride, stride, 1_uz}, layout);
{batch_count, row, col}, {batch_stride, stride, 1_uz}, Bypass{});
}
else
{
return HostTensorDescriptor(
{batch_count, row, col}, {batch_stride, 1_uz, stride}, layout);
{batch_count, row, col}, {batch_stride, 1_uz, stride}, Bypass{});
}
};

View File

@@ -4,27 +4,21 @@
bool run_gemm_add_multiply(const ProblemSize& problem_size, const ExecutionConfig& config)
{
using namespace ck::literals;
using Bypass = ck::tensor_layout::BypassLayoutVerification;
ProblemSize ps =
problem_size; // make mutable copy because default stride values of 0 need to be updated
auto& [M, N, K, StrideA, StrideB, StrideD0, StrideD1, StrideE] = ps;
auto& [M, N, K, StrideA, StrideB, StrideD0, StrideD1, StrideE] = problem_size;
auto f_host_tensor_descriptor = [](std::size_t row, std::size_t col, int& stride, auto layout) {
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
{
auto desc = HostTensorDescriptor({row, col}, {static_cast<std::size_t>(stride), 1_uz});
if(stride <= 0)
stride = desc.GetStrides()[0];
return desc;
}
else
{
auto desc = HostTensorDescriptor({row, col}, {1_uz, static_cast<std::size_t>(stride)});
if(stride <= 0)
stride = desc.GetStrides()[1];
return desc;
}
};
auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
{
return HostTensorDescriptor({row, col}, {stride, 1_uz}, Bypass{});
}
else
{
return HostTensorDescriptor({row, col}, {1_uz, stride}, Bypass{});
}
};
Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));

View File

@@ -27,8 +27,9 @@ using BF16 = ck::bhalf_t;
using I8 = int8_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using Bypass = ck::tensor_layout::BypassLayoutVerification;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using Add = ck::tensor_operation::element_wise::Add;
@@ -110,11 +111,11 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
{
return HostTensorDescriptor({row, col}, {stride, 1_uz});
return HostTensorDescriptor({row, col}, {stride, 1_uz}, Bypass{});
}
else
{
return HostTensorDescriptor({row, col}, {1_uz, stride});
return HostTensorDescriptor({row, col}, {1_uz, stride}, Bypass{});
}
};

View File

@@ -26,8 +26,9 @@ using S = ck::Sequence<Is...>;
using F16 = ck::half_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using Bypass = ck::tensor_layout::BypassLayoutVerification;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using Add = ck::tensor_operation::element_wise::Add;
@@ -109,11 +110,11 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
{
return HostTensorDescriptor({row, col}, {stride, 1_uz});
return HostTensorDescriptor({row, col}, {stride, 1_uz}, Bypass{});
}
else
{
return HostTensorDescriptor({row, col}, {1_uz, stride});
return HostTensorDescriptor({row, col}, {1_uz, stride}, Bypass{});
}
};

View File

@@ -27,7 +27,8 @@ using BF16 = ck::bhalf_t;
using I8 = int8_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Row = ck::tensor_layout::gemm::RowMajor;
using Bypass = ck::tensor_layout::BypassLayoutVerification;
using A0DataType = BF16;
using AsDataType = ck::Tuple<A0DataType>;
@@ -161,11 +162,11 @@ int main(int argc, char* argv[])
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
{
return HostTensorDescriptor({row, col}, {stride, 1_uz});
return HostTensorDescriptor({row, col}, {stride, 1_uz}, Bypass{});
}
else
{
return HostTensorDescriptor({row, col}, {1_uz, stride});
return HostTensorDescriptor({row, col}, {1_uz, stride}, Bypass{});
}
};

View File

@@ -27,7 +27,8 @@ using BF16 = ck::bhalf_t;
using I8 = int8_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Row = ck::tensor_layout::gemm::RowMajor;
using Bypass = ck::tensor_layout::BypassLayoutVerification;
using A0DataType = BF16;
using AsDataType = ck::Tuple<A0DataType>;
@@ -157,11 +158,11 @@ int main(int argc, char* argv[])
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
{
return HostTensorDescriptor({row, col}, {stride, 1_uz});
return HostTensorDescriptor({row, col}, {stride, 1_uz}, Bypass{});
}
else
{
return HostTensorDescriptor({row, col}, {1_uz, stride});
return HostTensorDescriptor({row, col}, {1_uz, stride}, Bypass{});
}
};

View File

@@ -24,7 +24,8 @@ using S = ck::Sequence<Is...>;
using F16 = ck::half_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Row = ck::tensor_layout::gemm::RowMajor;
using Bypass = ck::tensor_layout::BypassLayoutVerification;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
@@ -220,11 +221,11 @@ int main(int argc, char* argv[])
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
{
return HostTensorDescriptor({row, col}, {stride, 1_uz});
return HostTensorDescriptor({row, col}, {stride, 1_uz}, Bypass{});
}
else
{
return HostTensorDescriptor({row, col}, {1_uz, stride});
return HostTensorDescriptor({row, col}, {1_uz, stride}, Bypass{});
}
};

View File

@@ -27,7 +27,8 @@ using BF16 = ck::bhalf_t;
using I8 = int8_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Row = ck::tensor_layout::gemm::RowMajor;
using Bypass = ck::tensor_layout::BypassLayoutVerification;
using A0DataType = BF16;
using AsDataType = ck::Tuple<A0DataType>;
@@ -160,11 +161,11 @@ int main(int argc, char* argv[])
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
{
return HostTensorDescriptor({row, col}, {stride, 1_uz});
return HostTensorDescriptor({row, col}, {stride, 1_uz}, Bypass{});
}
else
{
return HostTensorDescriptor({row, col}, {1_uz, stride});
return HostTensorDescriptor({row, col}, {1_uz, stride}, Bypass{});
}
};

View File

@@ -28,8 +28,9 @@ using BF16 = ck::bhalf_t;
using I8 = int8_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using Bypass = ck::tensor_layout::BypassLayoutVerification;
using A0DataType = BF16;
using AsDataType = ck::Tuple<A0DataType>;
@@ -121,27 +122,19 @@ int main(int argc, char* argv[])
exit(0);
}
auto f_host_tensor_descriptor = [](std::size_t row,
std::size_t col,
ck::index_t& stride,
auto layout) {
using namespace ck::literals;
auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
using namespace ck::literals;
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
{
auto desc = HostTensorDescriptor({row, col}, {static_cast<std::size_t>(stride), 1_uz});
if(stride <= 0)
stride = desc.GetStrides()[0];
return desc;
}
else
{
auto desc = HostTensorDescriptor({row, col}, {1_uz, static_cast<std::size_t>(stride)});
if(stride <= 0)
stride = desc.GetStrides()[1];
return desc;
}
};
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
{
return HostTensorDescriptor({row, col}, {stride, 1_uz}, Bypass{});
}
else
{
return HostTensorDescriptor({row, col}, {1_uz, stride}, Bypass{});
}
};
Tensor<A0DataType> a0_m_k(f_host_tensor_descriptor(M, K, StrideA, A0Layout{}));
Tensor<B0DataType> b0_k_n(f_host_tensor_descriptor(K, N, StrideB, B0Layout{}));

View File

@@ -28,8 +28,9 @@ using BF16 = ck::bhalf_t;
using I8 = int8_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using Bypass = ck::tensor_layout::BypassLayoutVerification;
using A0DataType = BF16;
using AsDataType = ck::Tuple<A0DataType>;
@@ -121,27 +122,19 @@ int main(int argc, char* argv[])
exit(0);
}
auto f_host_tensor_descriptor = [](std::size_t row,
std::size_t col,
ck::index_t& stride,
auto layout) {
using namespace ck::literals;
auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
using namespace ck::literals;
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
{
auto desc = HostTensorDescriptor({row, col}, {static_cast<std::size_t>(stride), 1_uz});
if(stride <= 0)
stride = desc.GetStrides()[0];
return desc;
}
else
{
auto desc = HostTensorDescriptor({row, col}, {1_uz, static_cast<std::size_t>(stride)});
if(stride <= 0)
stride = desc.GetStrides()[1];
return desc;
}
};
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
{
return HostTensorDescriptor({row, col}, {stride, 1_uz}, Bypass{});
}
else
{
return HostTensorDescriptor({row, col}, {1_uz, stride}, Bypass{});
}
};
Tensor<A0DataType> a0_m_k(f_host_tensor_descriptor(M, K, StrideA, A0Layout{}));
Tensor<B0DataType> b0_k_n(f_host_tensor_descriptor(K, N, StrideB, B0Layout{}));

View File

@@ -28,8 +28,9 @@ using BF16 = ck::bhalf_t;
using I8 = int8_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using Bypass = ck::tensor_layout::BypassLayoutVerification;
using A0DataType = BF16;
using AsDataType = ck::Tuple<A0DataType>;
@@ -120,27 +121,19 @@ int main(int argc, char* argv[])
exit(0);
}
auto f_host_tensor_descriptor = [](std::size_t row,
std::size_t col,
ck::index_t& stride,
auto layout) {
using namespace ck::literals;
auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
using namespace ck::literals;
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
{
auto desc = HostTensorDescriptor({row, col}, {static_cast<std::size_t>(stride), 1_uz});
if(stride <= 0)
stride = desc.GetStrides()[0];
return desc;
}
else
{
auto desc = HostTensorDescriptor({row, col}, {1_uz, static_cast<std::size_t>(stride)});
if(stride <= 0)
stride = desc.GetStrides()[1];
return desc;
}
};
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
{
return HostTensorDescriptor({row, col}, {stride, 1_uz}, Bypass{});
}
else
{
return HostTensorDescriptor({row, col}, {1_uz, stride}, Bypass{});
}
};
Tensor<A0DataType> a0_m_k(f_host_tensor_descriptor(M, K, StrideA, A0Layout{}));
Tensor<B0DataType> b0_k_n(f_host_tensor_descriptor(K, N, StrideB, B0Layout{}));

View File

@@ -28,8 +28,9 @@ using F16 = ck::half_t;
using FP8 = ck::f8_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using Bypass = ck::tensor_layout::BypassLayoutVerification;
using A0DataType = FP8;
using B0DataType = FP8;
@@ -147,11 +148,11 @@ int main(int argc, char* argv[])
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
{
return HostTensorDescriptor({row, col}, {stride, 1_uz});
return HostTensorDescriptor({row, col}, {stride, 1_uz}, Bypass{});
}
else
{
return HostTensorDescriptor({row, col}, {1_uz, stride});
return HostTensorDescriptor({row, col}, {1_uz, stride}, Bypass{});
}
};

View File

@@ -28,8 +28,9 @@ using F16 = ck::half_t;
using F8 = ck::f8_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using Bypass = ck::tensor_layout::BypassLayoutVerification;
using A0DataType = F8;
using B0DataType = F8;
@@ -242,7 +243,7 @@ int main(int argc, char* argv[])
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
printf("arg3: time kernel (0=no, 1=yes)\n");
printf("arg4 to 5: N, K, tokens\n");
exit(0);
exit(1);
}
ck::index_t sorted_size = sorted_tile_num * MPerBlock;
@@ -294,7 +295,7 @@ int main(int argc, char* argv[])
Tensor<D0DataType> d0_t_n(HostTensorDescriptor({tokens, N}, {StrideDs[0], 0}));
Tensor<D1DataType> d1_e_n(
HostTensorDescriptor({experts, N * 2}, {StrideDs[1] * N * 2, StrideDs[1]}));
Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}));
Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}, Bypass{}));
Tensor<EDataType> e_t_n_host_result(
HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1}, Row{}));
Tensor<EDataType> e_t_n_device_result(

View File

@@ -30,8 +30,9 @@ using F8 = ck::f8_t;
using F32 = float;
using I64 = int64_t;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using Bypass = ck::tensor_layout::BypassLayoutVerification;
using A0DataType = F8;
using A1DataType = F32;
@@ -312,7 +313,7 @@ int main(int argc, char* argv[])
Col{}));
Tensor<B0DataType> b0_preshuffled(
HostTensorDescriptor({experts, K, N * 2}, {N * 2 * K, 1, K}, Col{}));
Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}));
Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}, Bypass{}));
Tensor<EDataType> e_t_n_host_result(
HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1}, Row{}));
Tensor<EDataType> e_t_n_device_result(

View File

@@ -29,8 +29,9 @@ using F16 = ck::half_t;
using F8 = ck::f8_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using Bypass = ck::tensor_layout::BypassLayoutVerification;
using A0DataType = F8;
using B0DataType = I4;
@@ -222,7 +223,7 @@ int main(int argc, char* argv[])
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
printf("arg3: time kernel (0=no, 1=yes)\n");
printf("arg4 to 5: N, K, tokens\n");
exit(0);
exit(1);
}
if(tokens * topk > valid_size)
@@ -268,10 +269,10 @@ int main(int argc, char* argv[])
HostTensorDescriptor({experts, K, N * 2}, {N * 2 * K, 1, K}, Col{}));
Tensor<B0DataType> b0_preshuffled(
HostTensorDescriptor({experts, K, N * 2}, {N * 2 * K, 1, K}, Col{}));
Tensor<D0DataType> d0_t_n(HostTensorDescriptor({tokens, N}, {StrideDs[0], 0}));
Tensor<D0DataType> d0_t_n(HostTensorDescriptor({tokens, N}, {StrideDs[0], 0}, Bypass{}));
Tensor<D1DataType> d1_e_n(
HostTensorDescriptor({experts, N * 2}, {StrideDs[1] * N * 2, StrideDs[1]}));
Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}));
Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}, Bypass{}));
Tensor<EDataType> e_t_n_host_result(
HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1}, Row{}));
Tensor<EDataType> e_t_n_device_result(

View File

@@ -286,7 +286,7 @@ int main(int argc, char* argv[])
HostTensorDescriptor({tokens, topk, N}, {StrideDs[0] * topk, StrideDs[0], 0}, Bypass{}));
Tensor<D1DataType> d1_e_n(
HostTensorDescriptor({experts, N}, {PerTokenQuant ? StrideDs[1] * N : 1, StrideDs[1]}));
Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}));
Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}, Bypass{}));
Tensor<EDataType> e_t_n_host_result(HostTensorDescriptor({tokens, N}, {N, 1}));
Tensor<EDataType> e_t_n_device_result(HostTensorDescriptor({tokens, N}, {N, 1}));
e_t_n_device_result.SetZero();

View File

@@ -30,8 +30,9 @@ using F8 = ck::f8_t;
using F32 = float;
using I64 = int64_t;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using Bypass = ck::tensor_layout::BypassLayoutVerification;
using A0DataType = F8;
using A1DataType = F32;
@@ -305,7 +306,7 @@ int main(int argc, char* argv[])
Col{}));
Tensor<B0DataType> b0_preshuffled(HostTensorDescriptor({experts, K, N}, {N * K, 1, K}, Col{}));
Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}));
Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}, Bypass{}));
Tensor<EDataType> e_t_n_host_result(HostTensorDescriptor({tokens, N}, {N, 1}));
Tensor<EDataType> e_t_n_device_result(HostTensorDescriptor({tokens, N}, {N, 1}));
e_t_n_device_result.SetZero();

View File

@@ -178,21 +178,17 @@ int main(int argc, char* argv[])
{
// use default case
}
else if(argc == 3)
{
// use default case
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
}
else if(argc == 7)
else if(argc == 3 || argc == 7)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
N = std::stoi(argv[4]);
K = std::stoi(argv[5]);
tokens = std::stoi(argv[6]);
if(argc == 7)
{
N = std::stoi(argv[4]);
K = std::stoi(argv[5]);
tokens = std::stoi(argv[6]);
}
}
else
{
@@ -200,7 +196,7 @@ int main(int argc, char* argv[])
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
printf("arg3: time kernel (0=no, 1=yes)\n");
printf("arg4 to 6: N, K, tokens\n");
exit(0);
exit(1);
}
ck::index_t StrideA = K;
@@ -244,8 +240,8 @@ int main(int argc, char* argv[])
Tensor<B0DataType> b0_e_n_k(HostTensorDescriptor({experts, K, N}, {N * K, 1, K}, Col{}));
Tensor<B0DataType> b0_preshuffled(HostTensorDescriptor({experts, K, N}, {N * K, 1, K}, Col{}));
Tensor<D0DataType> d0_t_n(HostTensorDescriptor({tokens, N}, {StrideDs[0], 0}, Bypass{}));
Tensor<D1DataType> d1_e_n(HostTensorDescriptor({experts, N}, {1, StrideDs[1]}));
Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}));
Tensor<D1DataType> d1_e_n(HostTensorDescriptor({experts, N}, {1, StrideDs[1]}, Bypass{}));
Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}, Bypass{}));
Tensor<EDataType> e_t_n_host_result(HostTensorDescriptor({tokens, N}, {N, 1}));
Tensor<EDataType> e_t_n_device_result(HostTensorDescriptor({tokens, N}, {N, 1}));
e_t_n_device_result.SetZero();

View File

@@ -31,8 +31,9 @@ using F32 = float;
using XDataType = ck::e8m0_bexp_t;
using XPackedDataType = int32_t; // 4 packed e8m0_bexp_t
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using Bypass = ck::tensor_layout::BypassLayoutVerification;
using A0DataType = F4;
using A1DataType = XPackedDataType;
@@ -285,7 +286,7 @@ int main(int argc, char* argv[])
HostTensorDescriptor({experts, (K + ScaleBlockSize - 1) / ScaleBlockSize, N * 2},
{N * 2 * Scale_Stride_BN, 1, Scale_Stride_BN},
Col{}));
Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}));
Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}, Bypass{}));
Tensor<EDataType> e_t_k_n_host_result(
HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1}, Row{}));
Tensor<EDataType> e_t_k_n_device_result(

View File

@@ -31,8 +31,9 @@ using F32 = float;
using XDataType = ck::e8m0_bexp_t;
using XPackedDataType = int32_t; // 4 packed e8m0_bexp_t
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using Bypass = ck::tensor_layout::BypassLayoutVerification;
using A0DataType = F4;
using A1DataType = XPackedDataType;
@@ -282,7 +283,7 @@ int main(int argc, char* argv[])
HostTensorDescriptor({experts, (K + ScaleBlockSize - 1) / ScaleBlockSize, N * 2},
{N * 2 * Scale_Stride_BN, 1, Scale_Stride_BN},
Col{}));
Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}));
Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}, Bypass{}));
Tensor<EDataType> e_t_k_n_host_result(
HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1}, Row{}));
Tensor<EDataType> e_t_k_n_device_result(

View File

@@ -32,8 +32,9 @@ using XDataType = ck::e8m0_bexp_t;
using XPackedDataType = int32_t; // 4 packed e8m0_bexp_t
using I64 = int64_t;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using Bypass = ck::tensor_layout::BypassLayoutVerification;
using A0DataType = F4;
using A1DataType = XPackedDataType;
@@ -315,7 +316,7 @@ int main(int argc, char* argv[])
HostTensorDescriptor({experts, (K + ScaleBlockSize - 1) / ScaleBlockSize, N * 2},
{N * 2 * Scale_Stride_BN, 1, Scale_Stride_BN},
Col{}));
Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}));
Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}, Bypass{}));
Tensor<EDataType> e_t_k_n_host_result(
HostTensorDescriptor({tokens, topk, N}, {topk * N, N, 1}, Row{}));
Tensor<EDataType> e_t_k_n_device_result(

View File

@@ -31,8 +31,9 @@ using F32 = float;
using XDataType = ck::e8m0_bexp_t;
using XPackedDataType = int32_t; // 4 packed e8m0_bexp_t
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using Bypass = ck::tensor_layout::BypassLayoutVerification;
using A0DataType = F4;
using A1DataType = XPackedDataType;
@@ -290,7 +291,7 @@ int main(int argc, char* argv[])
HostTensorDescriptor({experts, (K + ScaleBlockSize - 1) / ScaleBlockSize, N},
{N * Scale_Stride_BN, 1, Scale_Stride_BN},
Col{}));
Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}));
Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}, Bypass{}));
Tensor<EDataType> e_t_n_host_result(HostTensorDescriptor({tokens, N}, {N, 1}));
Tensor<EDataType> e_t_n_device_result(HostTensorDescriptor({tokens, N}, {N, 1}));

View File

@@ -31,8 +31,9 @@ using F32 = float;
using XDataType = ck::e8m0_bexp_t;
using XPackedDataType = int32_t; // 4 packed e8m0_bexp_t
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using Bypass = ck::tensor_layout::BypassLayoutVerification;
using A0DataType = F4;
using A1DataType = XPackedDataType;
@@ -290,7 +291,7 @@ int main(int argc, char* argv[])
HostTensorDescriptor({experts, (K + ScaleBlockSize - 1) / ScaleBlockSize, N},
{N * Scale_Stride_BN, 1, Scale_Stride_BN},
Col{}));
Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}));
Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}, Bypass{}));
Tensor<EDataType> e_t_n_host_result(HostTensorDescriptor({tokens, N}, {N, 1}));
Tensor<EDataType> e_t_n_device_result(HostTensorDescriptor({tokens, N}, {N, 1}));

View File

@@ -32,8 +32,9 @@ using XDataType = ck::e8m0_bexp_t;
using XPackedDataType = int32_t; // 4 packed e8m0_bexp_t
using I64 = int64_t;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using Bypass = ck::tensor_layout::BypassLayoutVerification;
using A0DataType = F4;
using A1DataType = XPackedDataType;
@@ -325,7 +326,7 @@ int main(int argc, char* argv[])
HostTensorDescriptor({experts, (K + ScaleBlockSize - 1) / ScaleBlockSize, N},
{N * Scale_Stride_BN, 1, Scale_Stride_BN},
Col{}));
Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}));
Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}, Bypass{}));
Tensor<EDataType> e_t_n_host_result(HostTensorDescriptor({tokens, N}, {N, 1}));
Tensor<EDataType> e_t_n_device_result(HostTensorDescriptor({tokens, N}, {N, 1}));